Commit 21752926 authored by: W WangZhen

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantization_inference_passes

test=develop
...@@ -212,7 +212,7 @@ endif()
if (WITH_JEMALLOC)
  find_package(JeMalloc REQUIRED)
  include_directories(${JEMALLOC_INCLUDE_DIR})
- add_definitions(-DWITH_JEMALLOC)
+ add_definitions(-DPADDLE_WITH_JEMALLOC)
endif()
include(generic) # simplify cmake module
...@@ -276,9 +276,3 @@ add_subdirectory(paddle)
if(WITH_PYTHON)
  add_subdirectory(python)
endif()
if(WITH_DOC)
find_package(Sphinx REQUIRED)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
...@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG WITH_GPU
ARG WITH_AVX
- ARG WITH_DOC
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
- ENV WITH_DOC=${WITH_DOC:-OFF}
ENV HOME /root
# Add bash enhancements
......
# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This modules defines
# SPHINX_EXECUTABLE
# SPHINX_FOUND
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
if( NOT SPHINX_EXECUTABLE )
set(_Python_VERSIONS
2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5
)
foreach( _version ${_Python_VERSIONS} )
set( _sphinx_NAMES sphinx-build-${_version} )
find_program( SPHINX_EXECUTABLE
NAMES ${_sphinx_NAMES}
PATHS
/usr/bin
/usr/local/bin
/opt/loca/bin
DOC "Sphinx documentation generator"
)
endforeach()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Sphinx DEFAULT_MSG
SPHINX_EXECUTABLE
)
option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON )
option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF )
option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF )
option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF )
option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF )
option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF )
option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF )
option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF )
option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF )
mark_as_advanced(
SPHINX_EXECUTABLE
SPHINX_HTML_OUTPUT
SPHINX_DIRHTML_OUTPUT
SPHINX_HTMLHELP_OUTPUT
SPHINX_QTHELP_OUTPUT
SPHINX_DEVHELP_OUTPUT
SPHINX_EPUB_OUTPUT
SPHINX_LATEX_OUTPUT
SPHINX_MAN_OUTPUT
SPHINX_TEXT_OUTPUT
)
function( Sphinx_add_target target_name builder conf cache source destination )
add_custom_target( ${target_name} ALL
COMMAND ${SPHINX_EXECUTABLE} -b ${builder}
-d ${cache}
-c ${conf}
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND cd ${destination} && ln -sf ./index_*.html index.html
)
set_property(
DIRECTORY APPEND PROPERTY
ADDITIONAL_MAKE_CLEAN_FILES
${destination}
)
endfunction()
# Target dependencies can be optionally listed at the end.
function( Sphinx_add_targets target_base_name conf source base_destination )
set( _dependencies )
foreach( arg IN LISTS ARGN )
set( _dependencies ${_dependencies} ${arg} )
endforeach()
if( ${SPHINX_HTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html )
add_dependencies( ${target_base_name}_html ${_dependencies} )
endif()
if( ${SPHINX_DIRHTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml )
add_dependencies( ${target_base_name}_dirhtml ${_dependencies} )
endif()
if( ${SPHINX_QTHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp )
add_dependencies( ${target_base_name}_qthelp ${_dependencies} )
endif()
if( ${SPHINX_DEVHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp )
add_dependencies( ${target_base_name}_devhelp ${_dependencies} )
endif()
if( ${SPHINX_EPUB_OUTPUT} )
Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub )
add_dependencies( ${target_base_name}_epub ${_dependencies} )
endif()
if( ${SPHINX_LATEX_OUTPUT} )
Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex )
add_dependencies( ${target_base_name}_latex ${_dependencies} )
endif()
if( ${SPHINX_MAN_OUTPUT} )
Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man )
add_dependencies( ${target_base_name}_man ${_dependencies} )
endif()
if( ${SPHINX_TEXT_OUTPUT} )
Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text )
add_dependencies( ${target_base_name}_text ${_dependencies} )
endif()
if( ${BUILD_TESTING} )
sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck )
add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
endif()
endfunction()
...@@ -388,6 +388,7 @@ function(cc_test TARGET_NAME)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
...@@ -460,6 +461,7 @@ function(nv_test TARGET_NAME)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endfunction(nv_test)
...@@ -708,9 +710,10 @@ function(py_test TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
         COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-        FLAGS_cpu_deterministic=true
+        FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
         PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
         ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
...@@ -52,8 +52,8 @@ function(op_library TARGET)
endif()
if(WITH_MKLDNN)
  string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
- if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
+ if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
-   list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+   list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
  endif()
endif()
else()
......
...@@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
...@@ -121,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
- paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
+ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
...@@ -141,10 +142,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
- paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
+ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
- paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1))
- paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+ paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -197,7 +198,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
- paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
+ paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -212,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
...@@ -318,10 +320,12 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp
paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
- paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
+ paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
- paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
+ paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -357,6 +361,10 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
......
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
...@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
...@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
- if(WITH_NGRAPH)
+ if (WITH_NGRAPH)
-   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
+   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
- else(WITH_NGRAPH)
+ else ()
    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
- endif(WITH_NGRAPH)
+ endif()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
...@@ -214,3 +206,24 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
# Get the current working branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Get the latest abbreviated commit hash of the working branch
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "commit: ${PADDLE_COMMIT}")
message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "@PADDLE_COMMIT@";
}
static std::string paddle_compile_branch() {
return "@PADDLE_BRANCH@";
}
static std::string paddle_version() {
return "@PADDLE_VERSION@";
}
} // namespace framework
} // namespace paddle
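For context, here is a minimal sketch of how the generated commit.h could be consumed once configure_file() has substituted the git metadata. Only the three accessor names come from the template above; the include path and the main() are illustrative assumptions.

```cpp
// Hypothetical consumer of the generated commit.h (include path assumed).
#include <iostream>

#include "paddle/fluid/framework/commit.h"

int main() {
  // The three accessors are filled in by configure_file() from git metadata.
  std::cout << "version: " << paddle::framework::paddle_version() << "\n"
            << "branch:  " << paddle::framework::paddle_compile_branch() << "\n"
            << "commit:  " << paddle::framework::paddle_commit() << std::endl;
  return 0;
}
```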
...@@ -91,7 +91,7 @@ struct BuildStrategy {
  int num_trainers_{1};
  int trainer_id_{0};
  std::vector<std::string> trainers_endpoints_;
- bool remove_unnecessary_lock_{false};
+ bool remove_unnecessary_lock_{true};
  // NOTE:
  // Before you add new options, think if it's a general strategy that works
......
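The only functional change in this hunk flips remove_unnecessary_lock_ on by default. A minimal sketch of a caller that opts back out follows; the header path and the paddle::framework::details namespace are assumptions, not shown in this diff.

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"  // path assumed

paddle::framework::details::BuildStrategy MakeStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  // The new default is true; reset it to false to restore the old,
  // lock-protected behaviour.
  strategy.remove_unnecessary_lock_ = false;
  return strategy;
}
```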
...@@ -25,6 +25,9 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
+ // If we set this to 1, all variables will be deleted when a batch finishes,
+ // and this costs 15%+ performance.
+ // Please be aware of this parameter.
  size_t num_iteration_per_drop_scope_{1};
  ExecutorType type_{kDefault};
  bool dry_run_{false};
......
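A hedged sketch of the trade-off described in the new comment: dropping scopes every iteration frees variables eagerly but costs throughput, so callers can raise num_iteration_per_drop_scope_ to trade memory for speed. The field names come from the struct above; the header path and the helper below are assumptions.

```cpp
#include "paddle/fluid/framework/details/execution_strategy.h"  // path assumed

paddle::framework::details::ExecutionStrategy MakeExecutionStrategy() {
  paddle::framework::details::ExecutionStrategy strategy;
  strategy.use_cuda_ = true;
  // Value 1 deletes local-scope variables after every batch (lowest memory,
  // per the comment above 15%+ slower); a larger value keeps them longer.
  strategy.num_iteration_per_drop_scope_ = 100;
  return strategy;
}
```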
...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
- #include "paddle/fluid/framework/ngraph_operator.h"
+ #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#endif
DECLARE_bool(benchmark);
...@@ -133,24 +133,6 @@ static void DeleteUnusedTensors(
  }
}
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(ng_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
...@@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+ #ifdef PADDLE_WITH_NGRAPH
+   if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
+ #endif
  auto ctx = Prepare(pdesc, block_id);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
...@@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
- if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
  return ctx;
}
......
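The new code gates nGraph twice: at compile time through PADDLE_WITH_NGRAPH and at run time through FLAGS_use_ngraph. A self-contained sketch of that double-gating pattern is below; FEATURE_WITH_FOO, FLAGS_use_foo and foo::Enable() are placeholders, not Paddle symbols.

```cpp
// Illustrative double gate: a compile-time feature macro plus a runtime flag.
#include <iostream>

#include "gflags/gflags.h"

DEFINE_bool(use_foo, false, "Enable the optional foo backend at runtime.");

namespace foo {
void Enable() { std::cout << "foo backend enabled\n"; }
}  // namespace foo

void MaybeEnableFoo() {
#ifdef FEATURE_WITH_FOO
  if (FLAGS_use_foo) {
    foo::Enable();  // only reachable when both gates are open
  }
#endif
  // Built without the feature: the runtime flag is silently ignored.
}
```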
...@@ -10,8 +10,22 @@ function(pass_library TARGET DEST)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(targetPrefix "")
# Get optional argument
set(extraMacroArgs ${ARGN})
list(LENGTH extraMacroArgs numExtraMacroArgs)
if(numExtraMacroArgs GREATER 0)
list(GET extraMacroArgs 0 targetPrefix)
endif()
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+ if(targetPrefix)
cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
else()
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
endif()
# add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
  message(STATUS "add pass ${TARGET} ${DEST}")
...@@ -62,11 +76,11 @@ foreach (index RANGE 3 6)
endforeach()
if(WITH_MKLDNN)
- pass_library(mkldnn_placement_pass base)
+ pass_library(mkldnn_placement_pass base mkldnn)
- pass_library(depthwise_conv_mkldnn_pass base)
+ pass_library(depthwise_conv_mkldnn_pass base mkldnn)
- pass_library(conv_bias_mkldnn_fuse_pass inference)
+ pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
- pass_library(conv_relu_mkldnn_fuse_pass inference)
+ pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
- pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+ pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
endif()
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
...@@ -86,7 +100,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
- cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+ cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
- cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+ cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
- cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
+ cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()
...@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
+ #include <set>
#include <vector>
namespace paddle {
...@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
}
std::unordered_set<Node *> visited;
- std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+ std::set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
......
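The switch from std::unordered_set to std::set changes the iteration order of to_visit from hash-bucket order to key order, which makes the traversal order predictable rather than dependent on hashing details. A small self-contained illustration of that property (not Paddle code):

```cpp
#include <iostream>
#include <set>
#include <unordered_set>

int main() {
  // std::set visits elements in sorted key order, so iteration is reproducible.
  std::set<int> ordered{42, 7, 19};
  for (int v : ordered) std::cout << v << ' ';  // always prints: 7 19 42
  std::cout << '\n';

  // std::unordered_set iteration depends on hashing and bucket layout, so the
  // order can differ between runs, standard libraries, or insertion patterns.
  std::unordered_set<int> unordered{42, 7, 19};
  for (int v : unordered) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}
```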
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
- #include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
- #include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include <functional>
#include <list>
#include <map>
......
...@@ -15,8 +15,8 @@
#include <gtest/gtest.h>
#include <string>
- #include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
+ #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
namespace paddle {
namespace framework {
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
- #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
- #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
- #include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
#include <string>
namespace paddle {
......
...@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  if (!platform::is_cpu_place(t.place())) {
-   LoDTensor tt;
-   framework::TensorCopy(t, platform::CPUPlace(), &tt);
+   LoDTensor cpu_tensor;
+   cpu_tensor.set_lod(t.lod());
+   framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
-   os << tt;
+   os << cpu_tensor;
    return os;
  }
......
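With this fix, printing a non-CPU LoDTensor first copies it to the CPU, carries the LoD over, and waits for the device before streaming, so callers can print any tensor directly. A hedged usage sketch (the helper function is illustrative, only operator<< comes from the code above):

```cpp
#include <iostream>

#include "paddle/fluid/framework/lod_tensor.h"

// Illustrative helper: works for CPU- and GPU-resident tensors alike, because
// operator<< above copies to the CPU (preserving the LoD) before printing.
void DumpTensor(const paddle::framework::LoDTensor& t) {
  std::cout << t << std::endl;
}
```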
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
class NgraphOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
NgraphOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit NgraphOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
...@@ -19,8 +19,6 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <vector>
- #include "gflags/gflags.h"
- #include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
...@@ -557,18 +555,17 @@ Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
    const std::string& name) const {
-  auto names = op().Outputs(name);
+  auto it = ctx_.outputs.find(name);
+  if (it == ctx_.outputs.end()) {
+    return {};
+  }
+  const std::vector<Variable*>& vars = it->second;
  std::vector<Tensor*> res;
-  res.reserve(names.size());
+  res.reserve(vars.size());
-  std::transform(names.begin(), names.end(), std::back_inserter(res),
+  std::transform(vars.begin(), vars.end(), std::back_inserter(res),
-                 [&](const std::string& sub_name) -> Tensor* {
+                 [&](Variable* var) -> Tensor* {
-                   auto var = scope_.FindVar(sub_name);
-                   if (var == nullptr) return nullptr;
-                   PADDLE_ENFORCE(
-                       var->IsType<LoDTensor>(),
-                       "%s should be LoDTensor, but the received type is %s",
-                       sub_name, ToTypeName(var->Type()));
-                   return var->GetMutable<LoDTensor>();
+                   return var == nullptr ? nullptr
+                                         : var->GetMutable<LoDTensor>();
                 });
  return res;
}
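For reference, a hedged sketch of how an op kernel typically consumes MultiOutput; the function and the "Out" slot name are hypothetical. With the rewrite above, the lookup goes through the cached ctx_.outputs map, missing slots yield an empty vector, and unset variables surface as nullptr.

```cpp
#include <vector>

#include "paddle/fluid/framework/operator.h"

// Hypothetical kernel fragment; "Out" is an illustrative output slot name.
void WriteOutputs(const paddle::framework::ExecutionContext& ctx) {
  std::vector<paddle::framework::Tensor*> outs =
      ctx.MultiOutput<paddle::framework::Tensor>("Out");
  for (paddle::framework::Tensor* out : outs) {
    if (out == nullptr) continue;  // absent slots pass through as nullptr
    // ... fill *out here ...
  }
}
```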
...@@ -1075,7 +1072,9 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
-  int data_type = -1;
+  proto::VarType::Type dafault_data_type =
+      static_cast<proto::VarType::Type>(-1);
+  proto::VarType::Type data_type = dafault_data_type;
  for (auto& input : this->inputs_) {
    const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
    for (size_t i = 0; i < vars.size(); ++i) {
...@@ -1092,18 +1091,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
      if (t != nullptr) {
        PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
                       input.first, i);
-        int tmp = static_cast<int>(t->type());
+        proto::VarType::Type tmp = t->type();
        PADDLE_ENFORCE(
-            tmp == data_type || data_type == -1,
+            tmp == data_type || data_type == dafault_data_type,
            "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-            Type(), data_type, tmp);
+            Type(), DataTypeToString(data_type), DataTypeToString(tmp));
        data_type = tmp;
      }
    }
  }
-  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<proto::VarType::Type>(data_type);
+  PADDLE_ENFORCE(data_type != dafault_data_type,
+                 "DataType should be indicated by input");
+  return data_type;
}
OpKernelType OperatorWithKernel::GetExpectedKernelType(
......
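The function above takes the first initialized input's dtype and enforces that every other input matches it. A small standalone sketch of that "single shared dtype" check, using plain ints in place of proto::VarType::Type (not Paddle code):

```cpp
#include <stdexcept>
#include <vector>

// Pick the common dtype of all inputs, or fail if they disagree or none is
// set. The value -1 plays the role of the sentinel dafault_data_type above.
int IndicateCommonDtype(const std::vector<int>& input_dtypes) {
  int data_type = -1;
  for (int dtype : input_dtypes) {
    if (dtype == -1) continue;  // uninitialized input, skip
    if (data_type != -1 && dtype != data_type) {
      throw std::runtime_error("inputs must share one data type");
    }
    data_type = dtype;
  }
  if (data_type == -1) {
    throw std::runtime_error("dtype must be indicated by at least one input");
  }
  return data_type;
}
```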
...@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
  check_memory_size();
  bool valid =
      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
- PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
+ PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d",
+                DataTypeToString(type_));
  return reinterpret_cast<const T*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
......
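data<T>() only reinterprets the buffer after checking the stored type tag; the change above merely makes the failure message report the type name instead of a raw enum value. A standalone sketch of that guarded-cast pattern (not the Paddle class):

```cpp
#include <stdexcept>
#include <string>
#include <typeindex>
#include <typeinfo>

// Illustrative buffer with a runtime type tag, mimicking the check in Tensor::data<T>().
struct TaggedBlob {
  const void* ptr;
  std::type_index type;

  template <typename T>
  const T* data() const {
    if (type != std::type_index(typeid(T))) {
      throw std::runtime_error(std::string("blob holds ") + type.name() +
                               ", not the requested type");
    }
    return static_cast<const T*>(ptr);
  }
};

// Usage sketch:
//   float value = 1.0f;
//   TaggedBlob blob{&value, typeid(float)};
//   blob.data<float>();   // ok
//   blob.data<double>();  // throws
```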
if(WITH_PYTHON)
- cc_library(layer SRCS layer.cc DEPS proto_desc operator)
+ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
- cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+ cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
  cc_library(engine SRCS engine.cc)
endif()
...@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
+ #include <deque>
#include <limits>
#include <map>
...@@ -22,6 +23,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
+ #include "paddle/fluid/framework/tensor_util.h"
+ #include "paddle/fluid/operators/math/blas.h"
+ #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
...@@ -34,22 +38,66 @@ std::map<int, py::object> py_funcs_;
using framework::Variable;
- void AddTo(Variable* src, Variable* dst) {
-   framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
-   framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+ namespace detail {
+ template <typename T>
class TensorAddToFunctor : public boost::static_visitor<> {
public:
TensorAddToFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
private:
int64_t numel_;
const T* x_;
T* y_;
};
} // namespace detail
void AddTo(Variable* src, Variable* dst, platform::Place place) {
framework::Tensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
framework::Tensor* src_tensor = src->GetMutable<framework::LoDTensor>();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label // FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it // ugly fix for it
if (src_tensor->numel() == 0) { if (src_tensor->numel() == 0) {
return; return;
} }
PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
"dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
src_tensor->numel()); src_tensor->numel());
float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
const float* src_data = src_tensor->data<float>(); detail::TensorAddToFunctor<float> func(
for (int64_t i = 0; i < src_tensor->numel(); ++i) { src_tensor->numel(), src_tensor->data<float>(),
dst_data[i] += src_data[i]; dst_tensor->mutable_data<float>(place));
} boost::apply_visitor(func, place);
} }
class Autograd { class Autograd {
...@@ -108,6 +156,8 @@ class Autograd { ...@@ -108,6 +156,8 @@ class Autograd {
for (auto it : candidate->pre_ops_) { for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) { for (OpBase* pre_op : it.second) {
if (!pre_op) continue; if (!pre_op) continue;
VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- "
<< it.first << " <---- " << pre_op->op_desc_->Type();
if (visited.find(pre_op) == visited.end()) { if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op); visited.insert(pre_op);
queue.push_back(pre_op); queue.push_back(pre_op);
...@@ -120,66 +170,104 @@ class Autograd { ...@@ -120,66 +170,104 @@ class Autograd {
} }
}; };
std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
const bool blocking) const {
PADDLE_ENFORCE(var_->IsInitialized(),
"Variable must be initialized when getting numpy tensor");
std::unique_ptr<VarBase> new_var(new VarBase());
framework::LoDTensor* tensor =
new_var->var_->GetMutable<framework::LoDTensor>();
tensor->Resize(var_->Get<framework::LoDTensor>().dims());
tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
if (blocking) {
platform::DeviceContext* dev_ctx =
platform::DeviceContextPool::Instance().Get(dst_place);
framework::TensorCopySync(var_->Get<framework::LoDTensor>(), dst_place,
tensor);
dev_ctx->Wait();
} else {
framework::TensorCopy(var_->Get<framework::LoDTensor>(), dst_place, tensor);
}
if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
}
return new_var;
}
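A hedged usage sketch of the NewVarBase API introduced above: copying a VarBase's tensor to another place. The surrounding objects (an initialized imperative::VarBase named var, CUDA being available) are assumptions for illustration, not part of this diff:

// Assumes `var` is an initialized imperative::VarBase.
// blocking = true waits on the destination device context, as in the
// implementation above, so the returned copy is safe to read right away.
std::unique_ptr<imperative::VarBase> gpu_copy =
    var.NewVarBase(platform::CUDAPlace(0), true /*blocking*/);
std::unique_ptr<imperative::VarBase> cpu_copy =
    var.NewVarBase(platform::CPUPlace(), true /*blocking*/);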
 framework::LoDTensor& VarBase::GradValue() {
   VLOG(3) << "get var grad " << var_desc_->Name();
   return *(grads_->var_->GetMutable<framework::LoDTensor>());
 }

 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (!grad_op_desc_ && backward_id_ <= 0) {
+  if (grad_op_descs_.empty() && backward_id_ <= 0) {
     LOG(WARNING) << "op with no grad: " << op_desc_->Type();
     return {};
   }

-  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
+  std::vector<framework::VariableValueMap> grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
-    grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
-        backward_id_,
-        grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
+    grad_outputs.resize(1);
+    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+        PyLayer::ApplyGrad(
+            backward_id_,
+            grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
   } else {
-    VLOG(3) << "op grad " << grad_op_desc_->Type();
-    for (auto it : grad_output_vars_) {
-      auto& outputs = grad_outputs[it.first];
-      for (size_t i = 0; i < it.second.size(); ++i) {
-        // Allocate a new variable
-        Variable* tmp_var = new framework::Variable();
-        tmp_var->GetMutable<framework::LoDTensor>();
-        outputs.push_back(tmp_var);
+    grad_outputs.resize(grad_op_descs_.size());
+    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+      framework::OpDesc* grad_op_desc = grad_op_descs_[k];
+      VLOG(3) << "op grad " << grad_op_desc->Type();
+      for (auto it : grad_output_vars_[k]) {
+        auto& outputs = grad_outputs[k][it.first];
+        for (size_t i = 0; i < it.second.size(); ++i) {
+          // Allocate a new variable
+          Variable* tmp_var = new framework::Variable();
+          tmp_var->GetMutable<framework::LoDTensor>();
+          outputs.push_back(tmp_var);
+        }
       }
-    }

-    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
+      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);

-    // No need to do compile time infer shape here.
-    // grad_op_desc_->InferShape(*block_);
-    grad_op_desc_->InferVarType(block_);
+      // No need to do compile time infer shape here.
+      // grad_op_desc_->InferShape(*block_);
+      grad_op_desc->InferVarType(block_);

-    std::unique_ptr<framework::OperatorBase> opbase =
-        framework::OpRegistry::CreateOp(*grad_op_desc_);
-    framework::OperatorWithKernel* op_kernel =
-        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+      std::unique_ptr<framework::OperatorBase> opbase =
+          framework::OpRegistry::CreateOp(*grad_op_desc);
+      framework::OperatorWithKernel* op_kernel =
+          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+      PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");

-    framework::Scope scope;
-    platform::CPUPlace place;
-    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-    p.op.RuntimeInferShape(scope, place, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+      framework::Scope scope;
+      PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
+      p.op.RuntimeInferShape(scope, place_, ctx);
+      p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    }
   }

-  for (auto it : grad_output_vars_) {
-    auto& outputs = grad_outputs[it.first];
-    auto& origin_outputs = it.second;
-    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
+  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
+    for (auto it : grad_output_vars_[k]) {
+      auto& outputs = grad_outputs[k][it.first];
+      auto& origin_outputs = it.second;
+      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());

-    for (size_t i = 0; i < outputs.size(); ++i) {
-      framework::Variable* grad = outputs[i];
-      framework::Variable* orig_grad = origin_outputs[i];
-      AddTo(grad, orig_grad);
-      delete grad;
+      for (size_t i = 0; i < outputs.size(); ++i) {
+        framework::Variable* grad = outputs[i];
+        framework::Variable* orig_grad = origin_outputs[i];
+        AddTo(grad, orig_grad, place_);
+        delete grad;
+      }
     }
   }
   return input_vars_;
 }
...@@ -188,8 +276,10 @@ void VarBase::RunBackward() {
   VLOG(3) << "start backward";
   auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
-  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
-  std::fill(data, data + grads_t->numel(), 1.0);
+  operators::math::set_constant(
+      *(platform::DeviceContextPool::Instance().Get(
+          var_->GetMutable<framework::LoDTensor>()->place())),
+      grads_t, 1.0);
   PADDLE_ENFORCE(
       grads_ ==
......
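The RunBackward change above swaps a raw std::fill over host memory for operators::math::set_constant driven by the tensor's own device context, which is what makes the 1.0 initialization work on both CPU and CUDA places. A hedged sketch of the same pattern in isolation; the tensor variable is an assumption:

// `t` is assumed to be a framework::LoDTensor* that already has a place and
// allocated memory; set_constant dispatches through the matching device
// context instead of writing through a host pointer.
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(t->place());
operators::math::set_constant(*dev_ctx, t, 1.0);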
...@@ -21,17 +21,22 @@
 #include <map>     // NOLINT
 #include <string>  // NOLINT
 #include <vector>  // NOLINT
+#include <memory>  // NOLINT

 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/operators/math/math_function.h"

 #include "paddle/fluid/imperative/type_defs.h"

 namespace paddle {
 namespace imperative {

+class VarBase;
+
 namespace py = ::pybind11;

 class PreparedOp {
...@@ -81,6 +86,8 @@ class PreparedOp {
     return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
   }

+  inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
+
   const framework::OperatorBase& op;
   const framework::RuntimeContext& ctx;
   framework::OperatorWithKernel::OpKernelFunc func;
...@@ -134,20 +141,31 @@ class VarBase {
   void RunBackward();

   void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name,
-                  int pre_op_out_idx, bool stop_gradient) {
+                  int pre_op_out_idx, bool pre_op_stop_gradient) {
     pre_op_ = pre_op;
     pre_op_out_name_ = pre_op_out_name;
     pre_op_out_idx_ = pre_op_out_idx;
-    stop_gradient_ = stop_gradient;
+    if (pre_op_stop_gradient) {
+      stop_gradient_ = pre_op_stop_gradient;
+    }
   }

   void ClearGradient() {
-    delete grads_;
-    grads_ = new VarBase(true);
+    VLOG(1) << "clear gradient of " << var_desc_->Name();
+    if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) {
+      auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
+      operators::math::set_constant(
+          *(platform::DeviceContextPool::Instance().Get(
+              grads_->var_->Get<framework::LoDTensor>().place())),
+          grads_t, 0.0);
+    }
   }

   framework::LoDTensor& GradValue();

+  std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
+                                      const bool blocking) const;
+
   inline std::string GradName() const {
     PADDLE_ENFORCE(
         var_desc_,
...@@ -175,11 +193,13 @@ class OpBase {
   OpBase()
       : op_desc_(nullptr),
         forward_id_(-1),
-        grad_op_desc_(nullptr),
-        backward_id_(-1) {}
+        backward_id_(-1),
+        place_(platform::CPUPlace()) {}

   virtual ~OpBase() {
-    if (grad_op_desc_) delete grad_op_desc_;
+    for (framework::OpDesc* desc : grad_op_descs_) {
+      delete desc;
+    }
   }

   std::map<std::string, std::vector<VarBase*>> ApplyGrad();
...@@ -188,18 +208,25 @@ class OpBase {
   // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
   framework::OpDesc* op_desc_;
   int forward_id_;
-  // When has backward, one of `grad_op_desc_` or `backward_id_` is set,
+  // When has backward, one of `grad_op_descs_` or `backward_id_` is set,
   // not both.
-  framework::OpDesc* grad_op_desc_;
+  // Note: each fwd op corresponds to a vector of bwd ops.
+  std::vector<framework::OpDesc*> grad_op_descs_;
   int backward_id_;

+  platform::Place place_;
+
   VarBasePtrMap input_vars_;
   VarBasePtrMap output_vars_;
   OpBasePtrMap pre_ops_;
   std::map<std::string, std::vector<int>> pre_ops_out_idx_;

-  framework::VariableValueMap grad_input_vars_;
-  framework::VariableValueMap grad_output_vars_;
+  // Inputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_input_vars_;
+  // Outputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_output_vars_;

   framework::BlockDesc* block_;
 };
......
...@@ -14,33 +14,61 @@
 #include "paddle/fluid/imperative/tracer.h"

+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace imperative {
 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
-                  framework::OpDesc** grad_op_desc,
+                  std::vector<framework::OpDesc*>* grad_op_descs,
                   std::unordered_map<std::string, std::string>* grad_to_var) {
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+  PADDLE_ENFORCE(grad_op_descs->empty());
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
       framework::OpInfoMap::Instance()
           .Get(op_desc.Type())
           .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
-  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
-  // TODO(panyx0718): Leak?
-  *grad_op_desc = grad_op_descs[0].release();
+
+  for (auto& desc : descs) {
+    grad_op_descs->emplace_back(desc.release());
+  }
 }
-void InitVar(framework::Variable* var, framework::Variable* grad_var) {
+void InitVar(framework::Variable* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(dev_ctx,
+                          "Could not get valid device from forward op");
   auto& var_t = var->Get<framework::LoDTensor>();
-  float* data =
-      grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-          var_t.dims(), platform::CPUPlace());
-  std::fill(data, data + var_t.numel(), 0.0);
+  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+      var_t.dims(), dev_ctx->GetPlace());
+  operators::math::set_constant(
+      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
 }
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
platform::Place result = place;
for (auto it : inputs) {
for (VarBase* var : it.second) {
platform::Place tmp_place =
var->var_->Get<framework::LoDTensor>().place();
if (!platform::is_same_place(tmp_place, result)) {
PADDLE_THROW(
"Input variable should keep in the same place: %s, but get place: "
"%s of input %s instead",
result, tmp_place, it.first);
}
}
}
return result;
} }
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, framework::BlockDesc* block, const VarBasePtrMap& outputs, framework::BlockDesc* block,
const platform::Place expected_place,
const bool stop_gradient) { const bool stop_gradient) {
std::map<std::string, VarBase*> vars; std::map<std::string, VarBase*> vars;
...@@ -57,11 +85,12 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -57,11 +85,12 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op->input_vars_ = inputs; op->input_vars_ = inputs;
for (auto it : op->input_vars_) { for (auto it : op->input_vars_) {
auto& invars = invars_map[it.first]; auto& invars = invars_map[it.first];
invars.reserve(it.second.size());
for (VarBase* inp : it.second) { for (VarBase* inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
op->op_desc_->Type(), inp->var_desc_->Name()); op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_); invars.emplace_back(inp->var_);
vars[inp->var_desc_->Name()] = inp; vars[inp->var_desc_->Name()] = inp;
if (inp->PreOp()) { if (inp->PreOp()) {
op->pre_ops_[it.first].push_back(inp->PreOp()); op->pre_ops_[it.first].push_back(inp->PreOp());
...@@ -78,9 +107,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -78,9 +107,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
for (auto it : op->output_vars_) { for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first]; auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second; const std::vector<VarBase*>& outputs = it.second;
outvars.reserve(outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i]; VarBase* out = outputs[i];
outvars.push_back(out->var_); outvars.emplace_back(out->var_);
vars[out->var_desc_->Name()] = out; vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
...@@ -105,51 +135,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -105,51 +135,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");

   framework::Scope scope;
-  platform::CPUPlace place;
-  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-  p.op.RuntimeInferShape(scope, place, ctx);
-  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+  op->place_ = GetExpectedPlace(expected_place, inputs);
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
+  prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
+  prepared_op.func(framework::ExecutionContext(
+      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));

   if (!stop_gradient) {
-    framework::OpDesc* grad_op_desc;
-    // TODO(panyx): Is this leaked?
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
         new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
-    op->grad_op_desc_ = grad_op_desc;
+    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());

-    for (auto it : grad_op_desc->Inputs()) {
-      auto& grad_in_vars = op->grad_input_vars_[it.first];
-      for (const std::string& grad_invar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_invar);
-        auto var_it = grad_to_var->find(grad_invar);
-        if (var_it == grad_to_var->end()) {
-          auto fwd_var_it = vars.find(grad_invar);
-          PADDLE_ENFORCE(fwd_var_it != vars.end());
-          // Forward inputs or outputs.
-          grad_in_vars.push_back(fwd_var_it->second->var_);
-        } else {
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
-          }
-          // Douts.
-          grad_in_vars.push_back(var->grads_->var_);
-        }
-      }
-    }
+    op->grad_input_vars_.resize(op->grad_op_descs_.size());
+    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+      framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            // Forward inputs or outputs.
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->var_->IsInitialized()) {
+              InitVar(var->var_, var->grads_->var_,
+                      prepared_op.GetDeviceContext());
+            }
+            // Douts.
+            grad_in_vars.push_back(var->grads_->var_);
+          }
+        }
+      }

-    for (auto it : grad_op_desc->Outputs()) {
-      auto& grad_out_vars = op->grad_output_vars_[it.first];
-      for (const std::string& grad_outvar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_outvar);
-        auto var_it = grad_to_var->find(grad_outvar);
-        PADDLE_ENFORCE(var_it != grad_to_var->end());
-        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
-        }
-        grad_out_vars.push_back(var->grads_->var_);
-      }
-    }
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[i][it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end(),
+                         "Could not found the grad op output var, should this "
+                         "operator %s's stop gradient be True",
+                         op_desc->Type());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->var_->IsInitialized()) {
+            InitVar(var->var_, var->grads_->var_,
+                    prepared_op.GetDeviceContext());
+          }
+          grad_out_vars.push_back(var->grads_->var_);
+        }
+      }
+    }
   }
...@@ -178,10 +216,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }
   if (!stop_gradient) {
+    op->grad_input_vars_.resize(1);
+    op->grad_output_vars_.resize(1);
     auto& grad_input_vars =
-        op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
+        op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
     auto& grad_output_vars =
-        op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
+        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];

     for (const VarBase* inp : inputs) {
       grad_input_vars.push_back(inp->var_);
...@@ -189,16 +229,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->var_);
     }
+
+    platform::CPUPlace place;
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->grads_->var_);
       if (!grad_input_vars.back()->IsInitialized()) {
-        InitVar(out->var_, grad_input_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(out->var_, grad_input_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
       }
     }
     for (const VarBase* inp : inputs) {
       grad_output_vars.push_back(inp->grads_->var_);
       if (!grad_output_vars.back()->IsInitialized()) {
-        InitVar(inp->var_, grad_output_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(inp->var_, grad_output_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
       }
     }
   }
......
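With the expected_place argument added above, the caller decides where a traced op should run and GetExpectedPlace only verifies that every input already lives on that place. A hedged sketch of a call site; the op, inputs, outputs, and block objects are assumed to be prepared elsewhere and are not part of this diff:

// Runs the traced op on GPU 0; Trace will raise if any input tensor sits on a
// different place than the one requested here.
imperative::Tracer tracer(block);
tracer.Trace(op, inputs, outputs, block, platform::CUDAPlace(0),
             false /*stop_gradient*/);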
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
...@@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, ...@@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc,
void InitVar(framework::Variable* var, framework::Variable* grad_var); void InitVar(framework::Variable* var, framework::Variable* grad_var);
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
class Tracer { class Tracer {
public: public:
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
virtual ~Tracer() {} virtual ~Tracer() {}
-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block, const bool stop_gradient = false);
+  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+             const VarBasePtrMap& outputs, framework::BlockDesc* block,
+             const platform::Place expected_place,
+             const bool stop_gradient = false);

   std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                 bool stop_gradient = false);

  private:
+  platform::Place GetPlace(const VarBasePtrMap& inputs);
+
   framework::BlockDesc* root_block_;
 };
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/platform/variant.h" #include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
...@@ -130,10 +131,14 @@ struct Argument { ...@@ -130,10 +131,14 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
// Memory optimized related. // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
// Indicate which kind of sort algorithm is used for operators, the memory // Indicate which kind of sort algorithm is used for operators, the memory
// optimization relays on the sort algorithm. // optimization relays on the sort algorithm.
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
......
...@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name, ...@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
attr->set_i(data); attr->set_i(data);
} }
template <> template <>
void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
const bool &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
attr->set_b(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name, void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) { const int64_t &data) {
auto *attr = op->add_attrs(); auto *attr = op->add_attrs();
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <sys/stat.h> #include <sys/stat.h>
#include <cstdio> #include <cstdio>
#include <fstream> #include <fstream>
#include <set>
#include <string> #include <string>
#include <typeindex> #include <typeindex>
#include <unordered_map> #include <unordered_map>
...@@ -29,9 +30,14 @@ limitations under the License. */ ...@@ -29,9 +30,14 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
 #ifdef _WIN32
-#define GCC_ATTRIBUTE(attr__) ;
+#include <direct.h>
+#include <io.h>
+#define GCC_ATTRIBUTE(attr__)
+#define MKDIR(path) _mkdir(path)
 #else
+#include <unistd.h>
 #define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
 #endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
...@@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) { ...@@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) {
return false; return false;
} }
static std::string GetDirRoot(const std::string &path) {
char sep = '/';
#ifdef _WIN32
sep = '\\';
#endif
size_t i = path.rfind(sep, path.length());
if (i != std::string::npos) {
return (path.substr(0, i));
}
return path;
}
static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
std::string opt_cache_dir = model_root + "/_opt_cache/";
if (!PathExists(opt_cache_dir)) {
PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
"Can not create optimize cache directory: %s, Make sure you "
"have permission to write",
opt_cache_dir);
}
return opt_cache_dir;
}
static std::string GetTrtCalibPath(const std::string &model_root,
const std::string &engine_key) {
return model_root + "/trt_calib_" + engine_key;
}
// If there is no calib table data file in model_opt_cache_dir, return "".
static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
const std::string &engine_key,
bool enable_int8) {
std::string trt_calib_table_path =
GetTrtCalibPath(model_opt_cache_dir, engine_key);
if (enable_int8 && FileExists(trt_calib_table_path)) {
VLOG(3) << "Calibration table file: " << trt_calib_table_path
<< "is found here";
std::ifstream infile(trt_calib_table_path, std::ios::in);
std::stringstream buffer;
buffer << infile.rdbuf();
std::string calibration_data(buffer.str());
return calibration_data;
}
return "";
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
......
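Putting the helpers above together: the calibration table for an int8 TensorRT engine is expected under <model_root>/_opt_cache/trt_calib_<engine_key>, and an empty return string means no table has been produced yet. A hedged usage sketch; the model path and engine key below are placeholders:

std::string cache_dir = GetOrCreateModelOptCacheDir("/path/to/model_dir");
std::string calib = GetTrtCalibTableData(cache_dir, "123456789" /*engine_key*/,
                                         true /*enable_int8*/);
if (calib.empty()) {
  // No calibration table yet: the engine will run calibration and write one.
}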
...@@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
new int(argument->tensorrt_min_subgraph_size())); new int(argument->tensorrt_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
bool enable_int8 = argument->tensorrt_precision_mode() ==
AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8));
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
: GetDirRoot(argument->model_program_path());
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
} }
// graph_ = pass->Apply(std::move(graph_)); // graph_ = pass->Apply(std::move(graph_));
...@@ -91,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) { ...@@ -91,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
} }
 framework::proto::ProgramDesc IRPassManager::AcquireProgram(
-    std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
+    std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
   auto pass =
       framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");

-  ProgramDesc desc(program);
+  // Direct using ProgramDesc desc(argument->main_program()) may cause
+  // incomplete copies of information.
+  ProgramDesc desc;
+  desc.CopyFrom(*program->Proto());
   pass->SetNotOwned("program", &desc);
auto *the_graph = graph->release(); auto *the_graph = graph->release();
*graph = pass->Apply(std::unique_ptr<Graph>(the_graph)); *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -42,8 +43,8 @@ class IRPassManager final { ...@@ -42,8 +43,8 @@ class IRPassManager final {
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph); std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
framework::proto::ProgramDesc AcquireProgram( framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
std::unique_ptr<Graph> *graph, const ProgramDesc &program) const; ProgramDesc *program) const;
framework::ir::Graph &graph() const { return *graph_; } framework::ir::Graph &graph() const { return *graph_; }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( ...@@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
return graph; return graph;
} }
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
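GenerateEngineKey above concatenates the ordered input and output names and hashes the result, so a given subgraph keeps the same key between the run that writes the calibration table and the run that loads it (std::hash is unseeded, so the value is stable for a given standard library build). A standalone sketch of the same scheme with made-up names:

#include <functional>
#include <iostream>
#include <set>
#include <string>

// Same idea as GenerateEngineKey: std::set keeps the names ordered, so the
// concatenation (and therefore the hash) does not depend on insertion order.
std::string MakeKey(const std::set<std::string>& ins,
                    const std::set<std::string>& outs) {
  std::string buf;
  for (const auto& n : ins) buf += n;
  for (const auto& n : outs) buf += n;
  return std::to_string(std::hash<std::string>()(buf));
}

int main() {
  std::cout << MakeKey({"conv_in0", "conv_in1"}, {"conv_out0"}) << "\n";
}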
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const { Graph *graph) const {
auto *op_desc = node->Op(); auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph(); auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty()); PADDLE_ENFORCE(!subgraph.empty());
framework::ProgramDesc *program_desc =
Get<framework::ProgramDesc *>("program");
// Add new block for TensorRTEngineOP
const framework::BlockDesc &main_block =
program_desc->Block(framework::kRootBlockIndex);
// const framework::BlockDesc& main_block = program_desc->Block(0);
framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
  // A fake block desc.
framework::proto::BlockDesc block_proto; framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto); framework::BlockDesc block_desc(nullptr, &block_proto);
...@@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
subgraph.size()); subgraph.size());
for (auto *node : subgraph) { for (auto *node : subgraph) {
auto *new_block_op = new_block->AppendOp();
auto *op = block_desc.AppendOp(); auto *op = block_desc.AppendOp();
*new_block_op->Proto() = *node->Op()->Proto();
*op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto();
} }
-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
+  // Then, we will use the input_names_with_id and output_names_with_id to
+  // generate the engine key.
+  // So, we use set instead of unordered_set here to ensure that the engine key
+  // is unique.
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
for (auto *x : node->outputs) { for (auto *x : node->outputs) {
output_names.insert(x->Name()); output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id())); output_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// to Tensor. // to Tensor.
std::vector<std::string> output_mapping; std::vector<std::string> output_mapping;
for (auto name : output_names) { for (auto name : output_names) {
// LOG(INFO) << name << " " << output_name_map.size();
PADDLE_ENFORCE(output_name_map.count(name) != 0); PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
...@@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
*vars->Add() = *node->Var()->Proto(); *vars->Add() = *node->Var()->Proto();
} }
} }
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
// Set attrs op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph", SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString()); block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size")); SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size")); SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
auto enable_int8 = Get<bool>("enable_int8");
auto engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id);
std::string calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
} }
std::vector<std::string> ExtractParameters( std::vector<std::string> ExtractParameters(
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
......
...@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { ...@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
} }
std::unique_ptr<Graph> graph(argument->main_graph_ptr()); std::unique_ptr<Graph> graph(argument->main_graph_ptr());
framework::ProgramDesc desc(argument->main_program());
// Direct using ProgramDesc desc(argument->main_program()) may cause
// incomplete copies of information.
framework::ProgramDesc desc;
desc.CopyFrom(*argument->main_program().Proto());
pass->SetNotOwned("program", &desc); pass->SetNotOwned("program", &desc);
auto thegraph = pass->Apply(std::move(graph)); auto thegraph = pass->Apply(std::move(graph));
thegraph.release(); // the argument still own the graph. thegraph.release(); // the argument still own the graph.
......
...@@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes( ...@@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
return batch_shapes; return batch_shapes;
} }
// Replace the -1 in shape to a real number to fake the shape.
std::vector<std::map<std::string, std::vector<int>>> FakeBatchVarShapes(
const framework::ProgramDesc& program) {
std::vector<std::map<std::string, std::vector<int>>> res;
res.emplace_back();
auto& record = res.front();
const int fake_batch_size = 3;
for (auto* var : program.Block(0).AllVars()) {
if (var->GetType() ==
framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
auto shape = var->GetShape();
for (auto& v : shape) {
if (v < 0) v = fake_batch_size;
}
record[var->Name()].assign(shape.begin(), shape.end());
}
}
return res;
}
// Calculate the average dim of each tensor from the batch shape cache. // Calculate the average dim of each tensor from the batch shape cache.
std::unordered_map<std::string, size_t> GetBatchAverageSize( std::unordered_map<std::string, size_t> GetBatchAverageSize(
const std::vector<std::map<std::string, std::vector<int>>>& batches) { const std::vector<std::map<std::string, std::vector<int>>>& batches) {
...@@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize( ...@@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
std::unordered_map<std::string, std::stringstream> var_batchsize_hashes; std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
for (auto& batch : batches) { for (auto& batch : batches) {
for (auto& ele : batch) { for (auto& ele : batch) {
PADDLE_ENFORCE(!ele.second.empty());
int batch_size = ele.second.front(); int batch_size = ele.second.front();
// TODO(Superjomn) might consume large memory here, use combine hash. // TODO(Superjomn) might consume large memory here, use combine hash.
var_batchsize_hashes[ele.first] << batch_size; var_batchsize_hashes[ele.first] << batch_size;
...@@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize( ...@@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
std::pair<size_t, size_t> GetRange(
const std::unordered_map<std::string, size_t>& ave_size) {
auto res = std::make_pair(std::numeric_limits<size_t>::max(),
std::numeric_limits<size_t>::min());
for (auto& item : ave_size) {
res.first = std::min(item.second, res.first);
res.second = std::max(item.second, res.second);
}
return res;
}
void MemoryOptimizePass::RunImpl(Argument* argument) { void MemoryOptimizePass::RunImpl(Argument* argument) {
// When force update, should not optimize memory. // When force update, should not optimize memory.
-  if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
+  if (!argument->enable_memory_optim() ||
+      argument->static_memory_optim_force_update())
     return;
graph_ = argument->main_graph_ptr(); graph_ = argument->main_graph_ptr();
...@@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { ...@@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
argument->model_program_path_valid() ? argument->model_program_path() argument->model_program_path_valid() ? argument->model_program_path()
: ""); : "");
VLOG(3) << "Load memory cache from " << path; VLOG(3) << "Load memory cache from " << path;
-  if (inference::IsFileExists(path)) {
-    VLOG(4) << "Performing memory optimize";
-    auto batches = DeseralizeBatchVarShapes(path);
-    auto var_batch_ave_size = GetBatchAverageSize(batches);
+  std::vector<std::map<std::string, std::vector<int>>> batches;
+
+  if (argument->static_memory_optim() && inference::IsFileExists(path)) {
+    string::PrettyLogInfo("--- Performing static memory optimize");
+    batches = DeseralizeBatchVarShapes(path);
+  } else {
+    string::PrettyLogInfo("--- Performing dynamic memory optimize");
+    batches = FakeBatchVarShapes(argument->main_program());
+  }
+  auto var_batch_ave_size = GetBatchAverageSize(batches);
// Get min and max memory size.
const auto range = GetRange(var_batch_ave_size);
const int cluster_size = std::max(
static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
1024);
const int cluster_size1 = std::max(
static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
1024);
std::unordered_map<std::string, Node*> tensor_nodes; std::unordered_map<std::string, Node*> tensor_nodes;
space_table_t space_table; space_table_t space_table;
CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
std::unordered_map<std::string, std::string> reuse_table; std::unordered_map<std::string, std::string> reuse_table;
double max_saving_ratio = 0.; double max_saving_ratio = 0.;
std::vector<std::function<MemoryAllocation()>> strategies; std::vector<std::function<MemoryAllocation()>> strategies;
for (int sort_kind = 0; sort_kind < 2; sort_kind++) { for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
if (argument->static_memory_optim()) {
      // This strategy only makes sense in static memory optimize.
strategies.emplace_back([&, sort_kind] { strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_batch_size = auto clustered_vars_by_batch_size =
AnalysisBatchShapesByBatchSize(batches); AnalysisBatchShapesByBatchSize(batches);
...@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { ...@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
space_table, &reuse_table, sort_kind, &allocation); space_table, &reuse_table, sort_kind, &allocation);
return allocation; return allocation;
}); });
}
strategies.emplace_back([&, sort_kind] { strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( auto clustered_vars_by_ave_size =
space_table, batches, 1024); // interval 1kb AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size);
MemoryAllocation allocation; MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
space_table, &reuse_table, sort_kind, &allocation); &reuse_table, sort_kind, &allocation);
return allocation; return allocation;
}); });
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size =
AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1);
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
&reuse_table, sort_kind, &allocation);
return allocation;
});
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
space_table, batches,
std::numeric_limits<int>::max()); // no intervals
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
&reuse_table, sort_kind, &allocation);
return allocation;
});
}
strategies.emplace_back([&, sort_kind] { std::function<MemoryAllocation()>* best_strategy{nullptr};
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
space_table, batches, 1024 * 1024); // interval 1MB
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
space_table, &reuse_table, sort_kind, &allocation);
return allocation;
});
strategies.emplace_back([&, sort_kind] { // Try all strategies to get the best result.
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( for (auto& strategy : strategies) {
space_table, batches, auto allocation = strategy();
std::numeric_limits<int>::max()); // no intervals string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
MemoryAllocation allocation; allocation.GetSavingRatio());
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, if (allocation.GetSavingRatio() > max_saving_ratio) {
space_table, &reuse_table, sort_kind, &allocation); max_saving_ratio = allocation.GetSavingRatio();
return allocation; best_strategy = &strategy;
});
} }
}
if (!best_strategy) {
LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
return;
}
auto memory_allocation = (*best_strategy)();
std::function<MemoryAllocation()>* best_strategy{nullptr}; string::PrettyLogInfo(
"--- Saved %.2f%s memory for workspace(temporary variables)",
memory_allocation.GetSavingRatio() * 100, "%");
// Try all strategies to get the best result. argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
for (auto& strategy : strategies) { new std::unordered_set<std::string>);
auto allocation = strategy(); auto& vars2remove =
string::PrettyLogDetail("--- get strategy saving %f memory for workspace", argument->main_graph().Get<std::unordered_set<std::string>>(
allocation.GetSavingRatio()); framework::ir::kGraphToProgramVarsToRemove);
if (allocation.GetSavingRatio() > max_saving_ratio) {
max_saving_ratio = allocation.GetSavingRatio(); PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
best_strategy = &strategy; argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
}
if (!best_strategy) {
LOG(ERROR)
<< "This model makes poor memory optimize, skip memory optimize";
return;
}
auto memory_allocation = (*best_strategy)();
string::PrettyLogH2(
"--- Saved %.2f%s memory for workspace(temporary variables)",
memory_allocation.GetSavingRatio() * 100, "%");
string::PrettyLogDetail("--- Allocated %d MB",
memory_allocation.allocated / 1024. / 1024.);
string::PrettyLogDetail("--- Saved %d MB",
memory_allocation.saved / 1024. / 1024.);
argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
new std::unordered_set<std::string>);
auto& vars2remove =
argument->main_graph().Get<std::unordered_set<std::string>>(
framework::ir::kGraphToProgramVarsToRemove);
PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
} }
float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
......
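The RunImpl changes above derive the clustering interval from the spread of average tensor sizes instead of the fixed 1 KB and 1 MB buckets: roughly (max - min) / 100 and (max - min) / 1000, never smaller than 1 KB. A standalone sketch of that arithmetic with assumed sizes:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed smallest / largest average tensor sizes from the shape cache.
  const size_t min_size = 4 * 1024;
  const size_t max_size = 64 * 1024 * 1024;
  // Mirrors the cluster_size / cluster_size1 formulas above: split the range
  // into ~100 and ~1000 buckets, with a 1 KB floor.
  const int cluster_size =
      std::max(static_cast<int>((max_size - min_size) / 100), 1024);
  const int cluster_size1 =
      std::max(static_cast<int>((max_size - min_size) / 1000), 1024);
  std::printf("coarse bucket: %d bytes, fine bucket: %d bytes\n", cluster_size,
              cluster_size1);
  return 0;
}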
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
namespace paddle { namespace paddle {
-PassStrategy *contrib::AnalysisConfig::pass_builder() const {
+PassStrategy *AnalysisConfig::pass_builder() const {
if (!pass_builder_.get()) { if (!pass_builder_.get()) {
if (use_gpu_) { if (use_gpu_) {
LOG(INFO) << "Create GPU IR passes"; LOG(INFO) << "Create GPU IR passes";
...@@ -42,27 +42,27 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { ...@@ -42,27 +42,27 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
return pass_builder_.get(); return pass_builder_.get();
} }
-contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
+AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
   model_dir_ = model_dir;

   Update();
 }

-contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
-                                        const std::string &params_file) {
+AnalysisConfig::AnalysisConfig(const std::string &prog_file,
+                               const std::string &params_file) {
   prog_file_ = prog_file;
   params_file_ = params_file;

   Update();
 }

-void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
-                                       const std::string &params_file_path) {
+void AnalysisConfig::SetModel(const std::string &prog_file_path,
+                              const std::string &params_file_path) {
   prog_file_ = prog_file_path;
   params_file_ = params_file_path;

   Update();
 }

-void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
-                                           int device_id) {
+void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
+                                  int device_id) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
use_gpu_ = true; use_gpu_ = true;
memory_pool_init_size_mb_ = memory_pool_init_size_mb; memory_pool_init_size_mb_ = memory_pool_init_size_mb;
...@@ -74,13 +74,13 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, ...@@ -74,13 +74,13 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
Update(); Update();
} }
-void contrib::AnalysisConfig::DisableGpu() {
+void AnalysisConfig::DisableGpu() {
use_gpu_ = false; use_gpu_ = false;
Update(); Update();
} }
-contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
+AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__; #define CP_MEMBER(member__) member__ = other.member__;
// Model related. // Model related.
...@@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { ...@@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(enable_memory_optim_); CP_MEMBER(enable_memory_optim_);
-  CP_MEMBER(memory_optim_force_update_);
+  CP_MEMBER(static_memory_optim_);
+  CP_MEMBER(static_memory_optim_force_update_);
CP_MEMBER(static_memory_optim_force_update_);
// TensorRT releated. // TensorRT releated.
CP_MEMBER(use_tensorrt_); CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
// MKLDNN releated. // MKLDNN releated.
CP_MEMBER(use_mkldnn_); CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_enabled_op_types_);
...@@ -128,7 +130,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { ...@@ -128,7 +130,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
Update(); Update();
} }
void contrib::AnalysisConfig::EnableMKLDNN() { void AnalysisConfig::EnableMKLDNN() {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMKLDNN(); pass_builder()->EnableMKLDNN();
use_mkldnn_ = true; use_mkldnn_ = true;
...@@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { ...@@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
Update(); Update();
} }
void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, void AnalysisConfig::EnableTensorRtEngine(
int max_batch_size, int workspace_size, int max_batch_size, int min_subgraph_size,
int min_subgraph_size) { AnalysisConfig::Precision precision_mode) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!use_gpu()) { if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...@@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
tensorrt_workspace_size_ = workspace_size; tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size; tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode;
Update(); Update();
#else #else
...@@ -162,7 +165,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -162,7 +165,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
} }
// TODO(Superjomn) refactor this, buggy. // TODO(Superjomn) refactor this, buggy.
void contrib::AnalysisConfig::Update() { void AnalysisConfig::Update() {
auto info = SerializeInfoCache(); auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return; if (info == serialized_info_cache_) return;
...@@ -222,7 +225,7 @@ void contrib::AnalysisConfig::Update() { ...@@ -222,7 +225,7 @@ void contrib::AnalysisConfig::Update() {
} }
} }
std::string contrib::AnalysisConfig::SerializeInfoCache() { std::string AnalysisConfig::SerializeInfoCache() {
std::stringstream ss; std::stringstream ss;
ss << model_dir_; ss << model_dir_;
ss << prog_file_; ss << prog_file_;
...@@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ...@@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
ss << tensorrt_min_subgraph_size_; ss << tensorrt_min_subgraph_size_;
ss << enable_memory_optim_; ss << enable_memory_optim_;
ss << memory_optim_force_update_; ss << static_memory_optim_;
ss << static_memory_optim_force_update_;
ss << use_mkldnn_; ss << use_mkldnn_;
for (auto &item : mkldnn_enabled_op_types_) ss << item; for (auto &item : mkldnn_enabled_op_types_) ss << item;
...@@ -256,14 +260,14 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ...@@ -256,14 +260,14 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
return ss.str(); return ss.str();
} }
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( void AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) { int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads; cpu_math_library_num_threads_ = cpu_math_library_num_threads;
Update(); Update();
} }
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// Get the GPU memory details and calculate the fraction of memory for the // Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool. // GPU memory pool.
...@@ -278,21 +282,23 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { ...@@ -278,21 +282,23 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif #endif
} }
void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { void AnalysisConfig::EnableMemoryOptim(bool static_optim,
bool force_update_static_cache) {
enable_memory_optim_ = true; enable_memory_optim_ = true;
memory_optim_force_update_ = force_update_cache; static_memory_optim_ = static_optim;
static_memory_optim_force_update_ = force_update_static_cache;
Update(); Update();
} }
bool contrib::AnalysisConfig::enable_memory_optim() const { bool AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_; return enable_memory_optim_;
} }
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size, size_t prog_buffer_size,
const char *param_buffer, const char *param_buffer,
size_t param_buffer_size) { size_t param_buffer_size) {
prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true; model_from_memory_ = true;
...@@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, ...@@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
Update(); Update();
} }
NativeConfig AnalysisConfig::ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
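For context, a minimal usage sketch (not part of this patch) of the relocated helper: the analysis settings are converted so the same model can also be served through the native engine. The model path is a placeholder.

// Hedged sketch; "./model_dir" is a hypothetical path.
paddle::AnalysisConfig analysis_config("./model_dir");
paddle::NativeConfig native_config = analysis_config.ToNativeConfig();
auto native_predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig>(native_config);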
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <fstream>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -25,6 +26,7 @@ ...@@ -25,6 +26,7 @@
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -37,13 +39,20 @@ ...@@ -37,13 +39,20 @@
#if PADDLE_WITH_TENSORRT #if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif #endif
DECLARE_bool(profile); DECLARE_bool(profile);
namespace paddle { namespace paddle {
using contrib::AnalysisConfig; using inference::Singleton;
#if PADDLE_WITH_TENSORRT
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
#endif
namespace { namespace {
bool IsPersistable(const framework::VarDesc *var) { bool IsPersistable(const framework::VarDesc *var) {
...@@ -113,6 +122,15 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -113,6 +122,15 @@ bool AnalysisPredictor::PrepareProgram(
if (!program) { if (!program) {
if (!LoadProgramDesc()) return false; if (!LoadProgramDesc()) return false;
// If not cloned, the parameters should be loaded.
// If config_.ir_optim() is True, the parameters are loaded in
// OptimizeInferenceProgram(), but other persistable variables
// (like RAW type vars) are not created in the scope.
// If config_.ir_optim() is False, the parameters are loaded in
// LoadParameters(), and the other persistable variables still need to be
// created. So in both cases, create the persistable variables first.
executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
// Optimize the program, and load parameters and modify them in the // Optimize the program, and load parameters and modify them in the
// scope_. // scope_.
// This will change the scope_ address. // This will change the scope_ address.
...@@ -120,15 +138,6 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -120,15 +138,6 @@ bool AnalysisPredictor::PrepareProgram(
status_ir_optim_enabled_ = true; status_ir_optim_enabled_ = true;
OptimizeInferenceProgram(); OptimizeInferenceProgram();
} else { } else {
// If the parent_scope is passed, we assert that the persistable variables
// are already created, so just create the no persistable variables.
// If not cloned, the parameters should be loaded
// OptimizeInferenceProgram.
// So in both cases, just the local variables are needed to load, not the
// parematers.
executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
// Load parameters // Load parameters
LOG(INFO) << "load parameters "; LOG(INFO) << "load parameters ";
LoadParameters(); LoadParameters();
...@@ -298,15 +307,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, ...@@ -298,15 +307,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) { framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch"; VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs_.size()); outputs->resize(fetches_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) { for (size_t i = 0; i < fetches_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col")); int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i); PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch = framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx); framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type(); auto type = fetch.type();
auto output = &(outputs->at(i)); auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0]; output->name = fetches_[idx]->Input("X")[0];
if (type == framework::proto::VarType::FP32) { if (type == framework::proto::VarType::FP32) {
GetFetchOne<float>(fetch, output); GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32; output->dtype = PaddleDType::FLOAT32;
...@@ -327,7 +336,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -327,7 +336,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu()); argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
argument_.SetStaticMemoryOptimForceUpdate(
config_.static_memory_optim_force_update_);
argument_.SetModelFromMemory(config_.model_from_memory_); argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program // Analyze inference_program
if (!config_.model_dir().empty()) { if (!config_.model_dir().empty()) {
...@@ -337,6 +348,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -337,6 +348,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
!config_.params_file().empty(), !config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set."); "Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file().empty()); PADDLE_ENFORCE(!config_.prog_file().empty());
std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.params_file()); argument_.SetModelParamsPath(config_.params_file());
} }
...@@ -347,6 +360,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -347,6 +360,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
} }
if (config_.use_mkldnn_) { if (config_.use_mkldnn_) {
...@@ -361,7 +375,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -361,7 +375,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
argument_.SetIrAnalysisPasses(passes); argument_.SetIrAnalysisPasses(passes);
argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get())); argument_.SetScopeNotOwned(scope_.get());
Analyzer().Run(&argument_); Analyzer().Run(&argument_);
PADDLE_ENFORCE(argument_.scope_valid()); PADDLE_ENFORCE(argument_.scope_valid());
...@@ -422,10 +436,10 @@ void AnalysisPredictor::PrepareFeedFetch() { ...@@ -422,10 +436,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
feed_names_[op->Output("Out")[0]] = idx; feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") { } else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col")); int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= static_cast<size_t>(idx)) { if (fetches_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1); fetches_.resize(idx + 1);
} }
fetchs_[idx] = op; fetches_[idx] = op;
} }
} }
} }
...@@ -567,7 +581,67 @@ bool AnalysisPredictor::LoadParameters() { ...@@ -567,7 +581,67 @@ bool AnalysisPredictor::LoadParameters() {
return true; return true;
} }
#if PADDLE_WITH_TENSORRT
bool AnalysisPredictor::SaveTrtCalibToDisk() {
PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
"This func can be invoked only in trt mode");
auto &block = inference_program_->Block(0);
for (auto &op_desc : block.AllOps()) {
if (op_desc->Type() == "tensorrt_engine") {
std::string engine_name =
boost::get<std::string>(op_desc->GetAttr("engine_key"));
if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
LOG(ERROR) << "You should run the predictor(with trt) on the real data "
"to generate calibration info";
return false;
}
TRTCalibratorEngine *calib_engine =
Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
LOG(INFO) << "Wait for calib threads done.";
calib_engine->calib_->waitAndSetDone();
LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
"of time...";
calib_engine->thr_->join();
std::string calibration_table_data =
calib_engine->calib_->getCalibrationTableAsString();
if (calibration_table_data.empty()) {
LOG(ERROR) << "the calibration table is empty.";
return false;
}
std::string model_opt_cache_dir =
argument_.Has("model_dir")
? argument_.model_dir()
: inference::analysis::GetDirRoot(argument_.model_program_path());
std::string calibration_table_data_path =
inference::analysis::GetTrtCalibPath(
inference::analysis::GetOrCreateModelOptCacheDir(
model_opt_cache_dir),
engine_name);
std::ofstream ofile(calibration_table_data_path, std::ios::out);
LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
<< calibration_table_data_path;
ofile << calibration_table_data;
ofile.close();
}
}
// Free all calibrator resources.
Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
return true;
}
#endif
AnalysisPredictor::~AnalysisPredictor() { AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT
if (config_.tensorrt_engine_enabled() &&
config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
Singleton<TRTCalibratorEngineManager>::Global().Has()) {
SaveTrtCalibToDisk();
}
#endif
if (FLAGS_profile) { if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal, platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log"); "./profile.log");
...@@ -638,12 +712,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { ...@@ -638,12 +712,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
// check if the cache exists // check if the cache exists
if (!config_.enable_memory_optim()) { if (!config_.enable_memory_optim()) {
need = false; need = false;
} else if (config_.enable_memory_optim() && } else if (config_.static_memory_optim_ &&
!inference::IsFileExists(inference::analysis::GetMemoryCachePath( !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file()))) { config_.model_dir(), config_.prog_file()))) {
need = true; need = true;
} else if (config_.enable_memory_optim() && } else if (config_.static_memory_optim_ &&
config_.memory_optim_force_update_) { config_.static_memory_optim_force_update_) {
need = true; need = true;
} }
...@@ -651,11 +725,15 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { ...@@ -651,11 +725,15 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
return need; return need;
} }
std::string AnalysisPredictor::GetSeriazlizedProgram() const {
return inference_program_->Proto()->SerializeAsString();
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
const contrib::AnalysisConfig &config) { const AnalysisConfig &config) {
return CreatePaddlePredictor<contrib::AnalysisConfig, return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
PaddleEngineKind::kAnalysis>(config); config);
} }
} // namespace paddle } // namespace paddle
......
...@@ -33,7 +33,6 @@ using inference::analysis::Argument; ...@@ -33,7 +33,6 @@ using inference::analysis::Argument;
using inference::analysis::Analyzer; using inference::analysis::Analyzer;
using framework::proto::ProgramDesc; using framework::proto::ProgramDesc;
using framework::NaiveExecutor; using framework::NaiveExecutor;
using contrib::AnalysisConfig;
/** \brief This predictor is based on the original native predictor with IR and /** \brief This predictor is based on the original native predictor with IR and
* Analysis support. * Analysis support.
...@@ -75,6 +74,8 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -75,6 +74,8 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid); void SetMkldnnThreadID(int tid);
std::string GetSeriazlizedProgram() const override;
protected: protected:
// For memory optimization. // For memory optimization.
bool need_collect_var_shapes_for_memory_optim(); bool need_collect_var_shapes_for_memory_optim();
...@@ -97,6 +98,21 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -97,6 +98,21 @@ class AnalysisPredictor : public PaddlePredictor {
void GetFetchOne(const framework::LoDTensor &fetchs, void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data); PaddleTensor *output_data);
#if PADDLE_WITH_TENSORRT
// When the Paddle-TRT INT8 engine is used, calibration table data must be
// generated first. The calibration table records the value range of each
// op's inputs and outputs. The whole process can be divided into two steps
// (see the usage sketch following this block):
//
// 1. Build a 32-bit engine, run it on the calibration set, and record a
//    histogram of the distribution of activation values for each tensor.
// 2. Build a calibration table from the histograms.
//
// After step 2, the calibration table is stored on disk.
bool SaveTrtCalibToDisk();
#endif
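For illustration, a hedged usage sketch of the new INT8 path (not part of this patch; the model directory and calibration inputs are placeholders). The predictor is configured with Precision::kInt8 and run on representative data; destroying the predictor then triggers SaveTrtCalibToDisk(), which writes the calibration table to disk.

// Hedged sketch; "./mobilenet_model" is a hypothetical model directory.
paddle::AnalysisConfig config("./mobilenet_model");
config.EnableUseGpu(100 /* MB for the memory pool */, 0 /* device id */);
config.EnableTensorRtEngine(1 << 20 /* workspace */, 1 /* max batch */,
                            3 /* min subgraph size */,
                            paddle::AnalysisConfig::Precision::kInt8);
auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
// Run the predictor on calibration data so the calibrator can record
// activation ranges, e.g. predictor->Run(calibration_inputs, &outputs);
// the calibration table is written when `predictor` goes out of scope.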
// Some more detailed tests, they are made the friends of the predictor, so that // Some more detailed tests, they are made the friends of the predictor, so that
// the all the details can be tested. // the all the details can be tested.
#if PADDLE_WITH_TESTING #if PADDLE_WITH_TESTING
...@@ -106,7 +122,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -106,7 +122,7 @@ class AnalysisPredictor : public PaddlePredictor {
#endif #endif
private: private:
contrib::AnalysisConfig config_; AnalysisConfig config_;
Argument argument_; Argument argument_;
std::unique_ptr<NaiveExecutor> executor_; std::unique_ptr<NaiveExecutor> executor_;
platform::Place place_; platform::Place place_;
...@@ -115,7 +131,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -115,7 +131,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_; std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_; std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_; std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_; std::vector<framework::OpDesc *> fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them. // concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_; std::vector<framework::LoDTensor> feed_tensors_;
......
...@@ -24,7 +24,6 @@ ...@@ -24,7 +24,6 @@
DEFINE_string(dirname, "", "dirname to tests."); DEFINE_string(dirname, "", "dirname to tests.");
namespace paddle { namespace paddle {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, analysis_off) { TEST(AnalysisPredictor, analysis_off) {
AnalysisConfig config; AnalysisConfig config;
...@@ -215,6 +214,8 @@ TEST(AnalysisPredictor, memory_optim) { ...@@ -215,6 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
{ {
// The first predictor help to cache the memory optimize strategy. // The first predictor help to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
// Run several times to check the parameters are not reused by mistake. // Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <sstream>
#include "paddle/fluid/framework/commit.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -97,4 +99,12 @@ void PaddleBuf::Free() { ...@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
} }
} }
std::string get_version() {
std::stringstream ss;
ss << "version: " << framework::paddle_version() << "\n";
ss << "commit: " << framework::paddle_commit() << "\n";
ss << "branch: " << framework::paddle_compile_branch() << "\n";
return ss.str();
}
} // namespace paddle } // namespace paddle
...@@ -295,7 +295,7 @@ TEST(inference_api_native, image_classification_gpu) { ...@@ -295,7 +295,7 @@ TEST(inference_api_native, image_classification_gpu) {
#endif #endif
TEST(PassBuilder, Delete) { TEST(PassBuilder, Delete) {
contrib::AnalysisConfig config; AnalysisConfig config;
config.DisableGpu(); config.DisableGpu();
config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
const auto& passes = config.pass_builder()->AllPasses(); const auto& passes = config.pass_builder()->AllPasses();
......
...@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { ...@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
predictor->Run({}, &outputs); predictor->Run({}, &outputs);
} }
TEST(paddle_inference_api, get_version) {
LOG(INFO) << "paddle version:\n" << get_version();
auto version = get_version();
ASSERT_FALSE(version.empty());
}
} // namespace paddle } // namespace paddle
...@@ -36,7 +36,7 @@ namespace demo { ...@@ -36,7 +36,7 @@ namespace demo {
*/ */
void Main() { void Main() {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config; paddle::AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_modeldir + "/__model__", config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__params__"); FLAGS_modeldir + "/__params__");
......
...@@ -34,7 +34,6 @@ DEFINE_bool(use_gpu, false, "Whether use gpu."); ...@@ -34,7 +34,6 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
using contrib::AnalysisConfig;
/* /*
* Use the native and analysis fluid engine to inference the demo. * Use the native and analysis fluid engine to inference the demo.
*/ */
......
...@@ -29,11 +29,6 @@ ...@@ -29,11 +29,6 @@
namespace paddle { namespace paddle {
class AnalysisPredictor; class AnalysisPredictor;
// ==
//
// -----------------------------------------------------------------------------------
// NOTE: The following APIs are not mature yet, we are still working on them.
namespace contrib {
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
struct AnalysisConfig { struct AnalysisConfig {
...@@ -42,6 +37,10 @@ struct AnalysisConfig { ...@@ -42,6 +37,10 @@ struct AnalysisConfig {
explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file, explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file); const std::string& params_file);
enum class Precision {
kFloat32 = 0,
kInt8,
};
/** Set model with a directory. /** Set model with a directory.
*/ */
...@@ -135,7 +134,8 @@ struct AnalysisConfig { ...@@ -135,7 +134,8 @@ struct AnalysisConfig {
* subgraph is less than this, it will not transfer to TensorRT engine. * subgraph is less than this, it will not transfer to TensorRT engine.
*/ */
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3); int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32);
/** A boolean state telling whether the TensorRT engine is used. /** A boolean state telling whether the TensorRT engine is used.
*/ */
bool tensorrt_engine_enabled() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
...@@ -162,17 +162,7 @@ struct AnalysisConfig { ...@@ -162,17 +162,7 @@ struct AnalysisConfig {
/** Transform the AnalysisConfig to NativeConfig. /** Transform the AnalysisConfig to NativeConfig.
*/ */
NativeConfig ToNativeConfig() const { NativeConfig ToNativeConfig() const;
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
/** Specify the operator type list to use MKLDNN acceleration. /** Specify the operator type list to use MKLDNN acceleration.
* @param op_list the operator type list. * @param op_list the operator type list.
*/ */
...@@ -195,7 +185,8 @@ struct AnalysisConfig { ...@@ -195,7 +185,8 @@ struct AnalysisConfig {
/** Turn on memory optimize /** Turn on memory optimize
* NOTE still in development, will release latter. * NOTE still in development, will release latter.
*/ */
void EnableMemoryOptim(bool force_update_cache = false); void EnableMemoryOptim(bool static_optim = false,
bool force_update_static_cache = false);
/** Tell whether the memory optimization is activated. */ /** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const; bool enable_memory_optim() const;
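For reference, a hedged sketch of how the reworked switch might be called (the model path is a placeholder): the default keeps the dynamic strategy, while passing true selects the static strategy, optionally forcing the cached variable-shape information to be regenerated.

// Hedged sketch; "./model_dir" is a hypothetical path.
paddle::AnalysisConfig config("./model_dir");
config.EnableMemoryOptim();  // dynamic memory reuse (default arguments)
// Static reuse, forcing the cached shape info to be collected again:
// config.EnableMemoryOptim(true /*static_optim*/,
//                          true /*force_update_static_cache*/);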
...@@ -238,10 +229,12 @@ struct AnalysisConfig { ...@@ -238,10 +229,12 @@ struct AnalysisConfig {
// We set this variable to control the minimum number of nodes in the // We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value. // subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3}; int tensorrt_min_subgraph_size_{3};
Precision tensorrt_precision_mode_;
// memory reuse related. // memory reuse related.
bool enable_memory_optim_{false}; bool enable_memory_optim_{false};
bool memory_optim_force_update_{false}; bool static_memory_optim_{false};
bool static_memory_optim_force_update_{false};
bool use_mkldnn_{false}; bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_; std::unordered_set<std::string> mkldnn_enabled_op_types_;
...@@ -262,5 +255,4 @@ struct AnalysisConfig { ...@@ -262,5 +255,4 @@ struct AnalysisConfig {
mutable std::unique_ptr<PassStrategy> pass_builder_; mutable std::unique_ptr<PassStrategy> pass_builder_;
}; };
} // namespace contrib
} // namespace paddle } // namespace paddle
...@@ -215,6 +215,14 @@ class PaddlePredictor { ...@@ -215,6 +215,14 @@ class PaddlePredictor {
*/ */
virtual ~PaddlePredictor() = default; virtual ~PaddlePredictor() = default;
/** \brief Get the serialized model program that is executed in the inference
 * phase. Its data type is ProgramDesc, which is a protobuf message.
 */
virtual std::string GetSeriazlizedProgram() const {
assert(false); // Force raise error.
return "NotImplemented";
}
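Since the returned string is a serialized ProgramDesc, it can be parsed back into the protobuf message if needed. A hedged sketch (the predictor variable is assumed to exist):

// Hedged sketch; `predictor` is a previously created AnalysisPredictor.
std::string buf = predictor->GetSeriazlizedProgram();
paddle::framework::proto::ProgramDesc desc;
bool parsed = desc.ParseFromString(buf);  // standard protobuf deserialization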
/** The common configs for all the predictors. /** The common configs for all the predictors.
*/ */
struct Config { struct Config {
...@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); ...@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype); int PaddleDtypeSize(PaddleDType dtype);
std::string get_version();
} // namespace paddle } // namespace paddle
...@@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { ...@@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy {
public: public:
GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_elementwise_add_act_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
"conv_elementwise_add2_act_fuse_pass", // // guaranteed at least v7
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
}); });
for (int i = 6; i >= 3; i--) { for (int i = 6; i >= 3; i--) {
......
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
......
...@@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { ...@@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) {
// init trt engine // init trt engine
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<TensorRTEngine> engine_; std::unique_ptr<TensorRTEngine> engine_;
engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
engine_->InitNetwork();
engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(2, 5, 5)); nvinfer1::Dims3(2, 5, 5));
......
...@@ -78,11 +78,9 @@ class TRTConvertValidation { ...@@ -78,11 +78,9 @@ class TRTConvertValidation {
scope_(scope), scope_(scope),
if_add_batch_(if_add_batch), if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) { max_batch_size_(max_batch_size) {
// create engine.
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
engine_->InitNetwork();
} }
// Declare a Variable as input with random initialization. // Declare a Variable as input with random initialization.
...@@ -175,7 +173,7 @@ class TRTConvertValidation { ...@@ -175,7 +173,7 @@ class TRTConvertValidation {
op_->Run(scope_, place); op_->Run(scope_, place);
// Execute TRT. // Execute TRT.
engine_->Execute(batch_size); engine_->Execute(batch_size);
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 3000; const size_t output_space_size = 3000;
...@@ -184,7 +182,7 @@ class TRTConvertValidation { ...@@ -184,7 +182,7 @@ class TRTConvertValidation {
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size); std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
cudaStreamSynchronize(*engine_->stream()); cudaStreamSynchronize(engine_->stream());
auto* var = scope_.FindVar(output); auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
......
...@@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { ...@@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) {
PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer); buffers.push_back(buf.buffer);
} }
PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(stream_);
cudaStreamSynchronize(*stream_);
SetRuntimeBatch(batch_size); SetRuntimeBatch(batch_size);
} }
TensorRTEngine::~TensorRTEngine() { TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(*stream_); cudaStreamSynchronize(stream_);
// clean buffer // clean buffer
for (auto &buf : buffers_) { for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
...@@ -70,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -70,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() {
// build engine. // build engine.
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxBatchSize(max_batch_);
infer_builder_->setMaxWorkspaceSize(max_workspace_); infer_builder_->setMaxWorkspaceSize(max_workspace_);
if (enable_int8_) {
infer_builder_->setInt8Mode(true);
PADDLE_ENFORCE(
calibrator_ != nullptr,
"The precision mode is 'INT8', the calibrator should not be nullptr");
infer_builder_->setInt8Calibrator(calibrator_);
}
infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
...@@ -173,7 +179,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, ...@@ -173,7 +179,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
auto &buf = buffer(name); auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, *stream_), cudaMemcpyDeviceToDevice, stream_),
0); 0);
} }
...@@ -194,7 +200,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, ...@@ -194,7 +200,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
auto &buf = buffer(name); auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, *stream_)); cudaMemcpyDeviceToHost, stream_));
} }
Buffer &TensorRTEngine::buffer(const std::string &name) { Buffer &TensorRTEngine::buffer(const std::string &name) {
...@@ -211,12 +217,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, ...@@ -211,12 +217,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
auto &buf = buffer(name); auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data); PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_NOT_NULL(stream_);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size; buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, *stream_)); cudaMemcpyHostToDevice, stream_));
} }
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
...@@ -227,7 +232,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, ...@@ -227,7 +232,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyDeviceToDevice, *stream_)); cudaMemcpyDeviceToDevice, stream_));
} }
void TensorRTEngine::SetITensor(const std::string &name, void TensorRTEngine::SetITensor(const std::string &name,
......
...@@ -23,12 +23,14 @@ limitations under the License. */ ...@@ -23,12 +23,14 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
class TRTInt8Calibrator;
/* /*
* TensorRT Engine. * TensorRT Engine.
* *
...@@ -54,17 +56,17 @@ class TensorRTEngine : public EngineBase { ...@@ -54,17 +56,17 @@ class TensorRTEngine : public EngineBase {
nvinfer1::Weights w_; nvinfer1::Weights w_;
}; };
TensorRTEngine(int max_batch, int max_workspace, TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
cudaStream_t* stream = nullptr, int device = 0, int device = 0, bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr,
nvinfer1::ILogger& logger = NaiveLogger::Global()) nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch), : max_batch_(max_batch),
max_workspace_(max_workspace), max_workspace_(max_workspace),
stream_(stream ? stream : &default_stream_), stream_(stream),
logger_(logger), device_(device),
device_(device) { enable_int8_(enable_int8),
freshDeviceId(); calibrator_(calibrator),
cudaStreamCreate(stream_); logger_(logger) {}
}
virtual ~TensorRTEngine(); virtual ~TensorRTEngine();
...@@ -102,7 +104,7 @@ class TensorRTEngine : public EngineBase { ...@@ -102,7 +104,7 @@ class TensorRTEngine : public EngineBase {
// NOTE this should be used after calling `FreezeNetwork`. // NOTE this should be used after calling `FreezeNetwork`.
Buffer& buffer(const std::string& name) override; Buffer& buffer(const std::string& name) override;
cudaStream_t* stream() { return stream_; } cudaStream_t stream() { return stream_; }
// Fill an input from CPU memory with name and size. // Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, const void* data, size_t size); void SetInputFromCPU(const std::string& name, const void* data, size_t size);
...@@ -142,8 +144,8 @@ class TensorRTEngine : public EngineBase { ...@@ -142,8 +144,8 @@ class TensorRTEngine : public EngineBase {
// In the normal case, the paddle-trt exists bug when runing the googlenet. // In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the // When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv // paddle-tensorrt will do the merging optimization, which fuse those conv
// into // into one conv, and then trigger bug. So, We should use strategy to avoid
// one conv, and then trigger bug. So, We should use strategy to avoid this // this
// optimization for the time being. This bug will be fixed in the future. // optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/> std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num; itensor_quote_num;
...@@ -156,11 +158,15 @@ class TensorRTEngine : public EngineBase { ...@@ -156,11 +158,15 @@ class TensorRTEngine : public EngineBase {
// the max memory size the engine uses // the max memory size the engine uses
int max_workspace_; int max_workspace_;
cudaStream_t stream_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
bool enable_int8_;
TRTInt8Calibrator* calibrator_;
// batch size of the current data, will be updated each Executation. // batch size of the current data, will be updated each Executation.
int batch_size_{-1}; int batch_size_{-1};
cudaStream_t* stream_;
// If stream_ is not set from outside, hold its own stream.
cudaStream_t default_stream_;
nvinfer1::ILogger& logger_; nvinfer1::ILogger& logger_;
std::vector<Buffer> buffers_; std::vector<Buffer> buffers_;
...@@ -169,8 +175,6 @@ class TensorRTEngine : public EngineBase { ...@@ -169,8 +175,6 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/> std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
itensor_map_; itensor_map_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_; std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
// TensorRT related internal members // TensorRT related internal members
...@@ -208,38 +212,6 @@ class TensorRTEngine : public EngineBase { ...@@ -208,38 +212,6 @@ class TensorRTEngine : public EngineBase {
#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
engine__->network()->add##layer__(ARGS); engine__->network()->add##layer__(ARGS);
/*
* Helper to control the TensorRT engine's creation and deletion.
*/
class TRT_EngineManager {
public:
bool HasEngine(const std::string& name) const {
return engines_.count(name) != 0;
}
// Get an engine called `name`.
TensorRTEngine* Get(const std::string& name) const {
return engines_.at(name).get();
}
// Create or get an engine called `name`
TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
const std::string& name, int gpu_device = 0) {
auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device);
engines_[name].reset(p);
return p;
}
void DeleteALl() {
for (auto& item : engines_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -27,8 +27,8 @@ namespace tensorrt { ...@@ -27,8 +27,8 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test { class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
// ASSERT_EQ(0, cudaStreamCreate(&stream_)); ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(10, 1 << 10, &stream_); engine_ = new TensorRTEngine(10, 1 << 10, stream_);
engine_->InitNetwork(); engine_->InitNetwork();
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "glog/logging.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Set the batch size before constructing the thread that executes the engine.
int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
TRTInt8Calibrator::TRTInt8Calibrator(
const std::unordered_map<std::string, size_t>& buffers, int batch_size,
std::string engine_name, const platform::Place place)
: batch_size_(batch_size), engine_name_(engine_name) {
int i = 0;
VLOG(4) << "Init a new calibrator: " << engine_name_;
for (const auto it : buffers) {
framework::Tensor temp_tensor;
std::string input_name = it.first;
int data_size = it.second;
int num_ele = data_size / sizeof(int16_t);
framework::DDim data_shape = framework::make_ddim({num_ele});
temp_tensor.Resize(data_shape);
data_tensors_.push_back(temp_tensor);
data_buffers_[input_name] = std::pair<void*, size_t>(
static_cast<void*>(temp_tensor.mutable_data<int16_t>(place)), num_ele);
i += 1;
}
}
TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
: batch_size_(0),
calib_running_(false),
data_is_set_(false),
done_(true),
calibration_table_(calib_data) {}
void TRTInt8Calibrator::waitAndSetDone() {
std::unique_lock<std::mutex> lk(mut_);
while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk);
if (!done_) {
done_ = true;
cond_.notify_all();
}
}
// There might be more than one input for a TRT subgraph,
// so we use a map to store the input information.
bool TRTInt8Calibrator::setBatch(
const std::unordered_map<std::string, void*>& data) {
VLOG(3) << "set batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// There is a producer and a consumer. The producer sets the batch data and
// the consumer gets the batch data. The size of the data pool is one, so
// the producer has to wait for the consumer to finish processing before it
// can set new data.
while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk);
// done_ is set to true by waitAndSetDone() once all calibration data
// have been processed.
if (done_) return false;
// Sets the batch.
for (const auto& it : data) {
auto dataptr = data_buffers_.find(it.first);
if (dataptr == data_buffers_.end()) {
LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
<< "' does not match with the buffer names";
}
const auto& d = dataptr->second;
PADDLE_ENFORCE(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
}
data_is_set_ = true;
cond_.notify_all();
return true;
}
bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
int num_bindings) {
VLOG(4) << "get batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// The consumer has just finished processing a batch of data,
// so the producer can set new data again.
calib_running_ = false;
cond_.notify_all();
// As long as there is data in the pool, the consumer can get it.
while (!data_is_set_ && !done_) cond_.wait(lk);
if (done_) return false;
// Gets the batch
for (int i = 0; i < num_bindings; i++) {
auto it = data_buffers_.find(names[i]);
if (it == data_buffers_.end()) {
LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
<< names[i] << "' at position " << i;
}
bindings[i] = it->second.first;
}
data_is_set_ = false;
calib_running_ = true;
VLOG(4) << "get batch done: " << engine_name_;
return true;
}
void TRTInt8Calibrator::setDone() {
std::unique_lock<std::mutex> lk(mut_);
done_ = true;
cond_.notify_all();
}
const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
if (calibration_table_.empty()) return nullptr;
length = calibration_table_.size();
return calibration_table_.data();
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
std::size_t length) {
calibration_table_ = std::string((const char*)ptr, length);
VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
<< " length=" << length;
}
TRTInt8Calibrator::~TRTInt8Calibrator() {
VLOG(4) << "Destroying calibrator for " << engine_name_;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <atomic>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class TensorRTEngine;
struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
int batch_size, std::string engine_name,
const platform::Place place);
explicit TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[],
int num_bindings) override;
bool setBatch(const std::unordered_map<std::string, void*>& data);
void setDone();
void waitAndSetDone();
const void* readCalibrationCache(std::size_t& length) override;
void writeCalibrationCache(const void* ptr, std::size_t length) override;
const std::string& getCalibrationTableAsString() {
return calibration_table_;
}
private:
const int batch_size_;
bool calib_running_{true};
bool data_is_set_{false};
bool done_{false};
std::mutex mut_;
std::condition_variable cond_;
std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
std::vector<framework::Tensor> data_tensors_;
std::string engine_name_;
std::string calibration_table_;
};
class TRTCalibratorEngine {
public:
TRTCalibratorEngine() {}
std::unique_ptr<TRTInt8Calibrator> calib_;
std::unique_ptr<std::thread> thr_;
std::unique_ptr<TensorRTEngine> engine_;
};
/*
* Manager to control the TensorRT Int8 calibration creation and deletion.
*/
class TRTCalibratorEngineManager {
public:
bool Has() const { return res_.size() > 0; }
bool Has(const std::string& name) const {
if (res_.count(name) == 0) return false;
return res_.at(name).get() != nullptr;
}
// Get Int8Calibrator via name
TRTCalibratorEngine* Get(const std::string& name) const {
return res_.at(name).get();
}
// Look up or create a calibrator.
TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
if (res_.count(engine_name) == 0) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
}
return res_.at(engine_name).get();
}
// Create an Int8Calibrator
TRTCalibratorEngine* Create(const std::string& engine_name) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
return p;
}
void DeleteALL() {
for (auto& item : res_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
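For reference, a hedged sketch of how the manager is reached elsewhere in this patch, through the existing Singleton helper (the engine key is a placeholder):

// Hedged sketch; "trt_engine_0" is a hypothetical engine key.
using paddle::inference::Singleton;
using paddle::inference::tensorrt::TRTCalibratorEngineManager;
auto &manager = Singleton<TRTCalibratorEngineManager>::Global();
auto *calib_res = manager.LookupOrCreate("trt_engine_0");
// calib_res->calib_, calib_res->thr_ and calib_res->engine_ are expected to be
// filled in elsewhere (e.g. by the TRT engine op) before SaveTrtCalibToDisk()
// reads them.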
...@@ -54,6 +54,7 @@ else() ...@@ -54,6 +54,7 @@ else()
message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
endif() endif()
# RNN2 # RNN2
set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
...@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) ...@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
endif() endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
# googlenet
inference_analysis_api_test_with_fake_data(test_analyzer_googlenet
"${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL)
# resnet50 # resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
...@@ -123,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 ...@@ -123,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# bert, max_len=20, embedding_dim=128
set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1 # anakin rnn1
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using paddle::PaddleTensor;
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
if (fields.size() < 5) return false;
tensors->clear();
tensors->reserve(5);
int i = 0;
// src_id
paddle::PaddleTensor src_id;
ParseTensor<int64_t>(fields[i++], &src_id);
tensors->push_back(src_id);
// pos_id
paddle::PaddleTensor pos_id;
ParseTensor<int64_t>(fields[i++], &pos_id);
tensors->push_back(pos_id);
// segment_id
paddle::PaddleTensor segment_id;
ParseTensor<int64_t>(fields[i++], &segment_id);
tensors->push_back(segment_id);
// self_attention_bias
paddle::PaddleTensor self_attention_bias;
ParseTensor<float>(fields[i++], &self_attention_bias);
tensors->push_back(self_attention_bias);
// next_segment_index
paddle::PaddleTensor next_segment_index;
ParseTensor<int64_t>(fields[i++], &next_segment_index);
tensors->push_back(next_segment_index);
return true;
}
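For reference, a sketch of the line format that ParseLine above expects, reconstructed from the parsing code; the shapes and values are made up and only illustrate the layout, not the real bert_data_len20 dataset.

// Illustrative only: one input line is five ';'-separated fields, each of the
// form "<space-separated shape>:<space-separated data>".
void ParseLineFormatSketch() {
  const std::string sample_line =
      "1 3:101 2054 102;"          // src_id              int64, shape [1, 3]
      "1 3:0 1 2;"                 // pos_id              int64, shape [1, 3]
      "1 3:0 0 0;"                 // segment_id          int64, shape [1, 3]
      "1 3 3:0 0 0 0 0 0 0 0 0;"   // self_attention_bias float, shape [1, 3, 3]
      "1 1:2";                     // next_segment_index  int64, shape [1, 1]
  std::vector<paddle::PaddleTensor> feed;
  CHECK(ParseLine(sample_line, &feed));
  CHECK_EQ(feed.size(), 5UL);
}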
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_infer_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_infer_data);
std::string line;
int sample = 0;
// The unit-test dataset only has 10 samples; each sample has 5 feeds.
while (std::getline(fin, line)) {
std::vector<paddle::PaddleTensor> feed_data;
ParseLine(line, &feed_data);
inputs->push_back(std::move(feed_data));
sample++;
if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break;
}
LOG(INFO) << "number of samples: " << sample;
return true;
}
void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); }
void profile(bool use_mkldnn = false) {
AnalysisConfig config;
SetConfig(&config);
if (use_mkldnn) {
config.EnableMKLDNN();
}
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> inputs;
LoadInputData(&inputs);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
inputs, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_bert, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_bert, profile_mkldnn) { profile(true); }
#endif
// Check the fuse status
TEST(Analyzer_bert, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
LOG(INFO) << "num_ops: " << num_ops;
}
// Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<std::vector<PaddleTensor>> inputs;
LoadInputData(&inputs);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), inputs);
}
TEST(Analyzer_bert, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_bert, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
// Compare Deterministic result
TEST(Analyzer_bert, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> inputs;
LoadInputData(&inputs);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
inputs);
}
} // namespace inference
} // namespace paddle
...@@ -19,7 +19,6 @@ DEFINE_int32(max_turn_num, 9, ...@@ -19,7 +19,6 @@ DEFINE_int32(max_turn_num, 9,
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
constexpr int32_t kMaxTurnLen = 50; constexpr int32_t kMaxTurnLen = 50;
...@@ -165,7 +164,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -165,7 +164,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_slots->push_back(std::move(response_mask_tensor)); input_slots->push_back(std::move(response_mask_tensor));
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim(true); cfg->SwitchIrOptim(true);
...@@ -187,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -187,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (use_mkldnn) { if (use_mkldnn) {
...@@ -223,7 +222,7 @@ TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } ...@@ -223,7 +222,7 @@ TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
// Check the fuse status // Check the fuse status
TEST(Analyzer_dam, fuse_statis) { TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -253,17 +252,17 @@ void compare(bool use_mkldnn = false) { ...@@ -253,17 +252,17 @@ void compare(bool use_mkldnn = false) {
} }
// Compare result of NativeConfig and AnalysisConfig with memory optimization. // Compare result of NativeConfig and AnalysisConfig with memory optimization.
TEST(Analyzer_dam, compare_with_memory_optim) { TEST(Analyzer_dam, compare_with_static_memory_optim) {
// The small dam will core in CI, but works in local. // The small dam will core in CI, but works in local.
if (FLAGS_max_turn_num == 9) { if (FLAGS_max_turn_num == 9) {
contrib::AnalysisConfig cfg, cfg1; AnalysisConfig cfg, cfg1;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
// Run the first time to force to update memory cache // Run the first time to force to update memory cache
SetConfig(&cfg); SetConfig(&cfg);
cfg.EnableMemoryOptim(true); cfg.EnableMemoryOptim(true, true /*force update*/);
CompareNativeAndAnalysis( CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
...@@ -271,7 +270,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { ...@@ -271,7 +270,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
// Run second time to use the memory cache and perform memory optimization. // Run second time to use the memory cache and perform memory optimization.
SetConfig(&cfg1); SetConfig(&cfg1);
cfg1.EnableMemoryOptim(); cfg1.EnableMemoryOptim(true, false /*do not force update*/);
CompareNativeAndAnalysis( CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg1), reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
...@@ -279,6 +278,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) { ...@@ -279,6 +278,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
} }
} }
TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
// The small dam will core dump in CI, but works locally.
if (FLAGS_max_turn_num == 9) {
AnalysisConfig cfg, cfg1;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
// Run the first time to force an update of the memory cache
SetConfig(&cfg);
cfg.EnableMemoryOptim();
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
}
TEST(Analyzer_dam, compare) { compare(); } TEST(Analyzer_dam, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
...@@ -18,8 +18,6 @@ namespace paddle { ...@@ -18,8 +18,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<int64_t> data; std::vector<int64_t> data;
std::vector<size_t> lod; std::vector<size_t> lod;
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> query, title; std::vector<std::vector<int64_t>> query, title;
...@@ -75,7 +74,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -75,7 +74,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model); cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames();
...@@ -95,7 +94,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -95,7 +94,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -130,7 +129,7 @@ TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); } ...@@ -130,7 +129,7 @@ TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); }
// Check the fuse status // Check the fuse status
TEST(Analyzer_MM_DNN, fuse_statis) { TEST(Analyzer_MM_DNN, fuse_statis) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -141,7 +140,7 @@ TEST(Analyzer_MM_DNN, fuse_statis) { ...@@ -141,7 +140,7 @@ TEST(Analyzer_MM_DNN, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) { void compare(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (use_mkldnn) { if (use_mkldnn) {
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> word, mention; std::vector<std::vector<int64_t>> word, mention;
...@@ -76,7 +75,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) { ...@@ -76,7 +75,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { void SetConfig(AnalysisConfig *cfg, bool memory_load = false) {
if (memory_load) { if (memory_load) {
std::string buffer_prog, buffer_param; std::string buffer_prog, buffer_param;
ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
...@@ -105,7 +104,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -105,7 +104,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
void profile(bool memory_load = false) { void profile(bool memory_load = false) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg, memory_load); SetConfig(&cfg, memory_load);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -136,7 +135,7 @@ TEST(Analyzer_Chinese_ner, profile_memory_load) { ...@@ -136,7 +135,7 @@ TEST(Analyzer_Chinese_ner, profile_memory_load) {
// Check the fuse status // Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) { TEST(Analyzer_Chinese_ner, fuse_statis) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -152,7 +151,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { ...@@ -152,7 +151,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Chinese_ner, compare) { TEST(Analyzer_Chinese_ner, compare) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> query_basic, query_phrase, title_basic, std::vector<std::vector<int64_t>> query_basic, query_phrase, title_basic,
...@@ -103,7 +102,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -103,7 +102,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model); cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames();
...@@ -123,7 +122,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -123,7 +122,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_Pyramid_DNN, profile) { TEST(Analyzer_Pyramid_DNN, profile) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -147,7 +146,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { ...@@ -147,7 +146,7 @@ TEST(Analyzer_Pyramid_DNN, profile) {
// Check the fuse status // Check the fuse status
TEST(Analyzer_Pyramid_DNN, fuse_statis) { TEST(Analyzer_Pyramid_DNN, fuse_statis) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -158,7 +157,7 @@ TEST(Analyzer_Pyramid_DNN, fuse_statis) { ...@@ -158,7 +157,7 @@ TEST(Analyzer_Pyramid_DNN, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Pyramid_DNN, compare) { TEST(Analyzer_Pyramid_DNN, compare) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
......
...@@ -20,7 +20,6 @@ namespace paddle { ...@@ -20,7 +20,6 @@ namespace paddle {
namespace inference { namespace inference {
using namespace framework; // NOLINT using namespace framework; // NOLINT
using namespace contrib; // NOLINT
struct DataRecord { struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all; std::vector<std::vector<std::vector<float>>> link_step_data_all;
...@@ -223,7 +222,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -223,7 +222,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_rnn1, profile) { TEST(Analyzer_rnn1, profile) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.DisableGpu(); cfg.DisableGpu();
cfg.SwitchIrDebug(); cfg.SwitchIrDebug();
...@@ -237,7 +236,7 @@ TEST(Analyzer_rnn1, profile) { ...@@ -237,7 +236,7 @@ TEST(Analyzer_rnn1, profile) {
// Check the fuse status // Check the fuse status
TEST(Analyzer_rnn1, fuse_statis) { TEST(Analyzer_rnn1, fuse_statis) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
...@@ -254,7 +253,7 @@ TEST(Analyzer_rnn1, fuse_statis) { ...@@ -254,7 +253,7 @@ TEST(Analyzer_rnn1, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_rnn1, compare) { TEST(Analyzer_rnn1, compare) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -276,7 +275,7 @@ TEST(Analyzer_rnn1, compare_determine) { ...@@ -276,7 +275,7 @@ TEST(Analyzer_rnn1, compare_determine) {
// Test Multi-Thread. // Test Multi-Thread.
TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, multi_thread) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
......
...@@ -20,7 +20,6 @@ limitations under the License. */ ...@@ -20,7 +20,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct Record { struct Record {
std::vector<float> data; std::vector<float> data;
......
...@@ -58,9 +58,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { ...@@ -58,9 +58,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
return os; return os;
} }
std::ostream &operator<<(std::ostream &os, std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
const contrib::AnalysisConfig &config) { os << GenSpaces(num_spaces) << "AnalysisConfig {\n";
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++; num_spaces++;
os << config.ToNativeConfig(); os << config.ToNativeConfig();
if (!config.model_from_memory()) { if (!config.model_from_memory()) {
......
...@@ -58,7 +58,7 @@ namespace inference { ...@@ -58,7 +58,7 @@ namespace inference {
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config = const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config); reinterpret_cast<const AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
LOG(INFO) << *analysis_config; LOG(INFO) << *analysis_config;
return; return;
...@@ -102,9 +102,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -102,9 +102,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const PaddlePredictor::Config *config, bool use_analysis = true) { const PaddlePredictor::Config *config, bool use_analysis = true) {
const auto *analysis_config = const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config); reinterpret_cast<const AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config); return CreatePaddlePredictor<AnalysisConfig>(*analysis_config);
} }
auto native_config = analysis_config->ToNativeConfig(); auto native_config = analysis_config->ToNativeConfig();
return CreatePaddlePredictor<NativeConfig>(native_config); return CreatePaddlePredictor<NativeConfig>(native_config);
...@@ -139,7 +139,8 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -139,7 +139,8 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname, bool is_combined = true, const std::string &dirname, bool is_combined = true,
std::string model_filename = "model", std::string model_filename = "model",
std::string params_filename = "params", std::string params_filename = "params",
const std::vector<std::string> *feed_names = nullptr) { const std::vector<std::string> *feed_names = nullptr,
const int continuous_input_index = 0) {
// Set fake_image_data // Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes( std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
...@@ -176,7 +177,8 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -176,7 +177,8 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
float *input_data = static_cast<float *>(input.data.data()); float *input_data = static_cast<float *>(input.data.data());
// fill input data, for profile easily, do not use random data here. // fill input data, for profile easily, do not use random data here.
for (size_t j = 0; j < len; ++j) { for (size_t j = 0; j < len; ++j) {
*(input_data + j) = static_cast<float>(j) / len; *(input_data + j) =
static_cast<float>((j + continuous_input_index) % len) / len;
} }
} }
(*inputs).emplace_back(input_slots); (*inputs).emplace_back(input_slots);
...@@ -344,6 +346,16 @@ void CompareNativeAndAnalysis( ...@@ -344,6 +346,16 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) {
int batch_size = FLAGS_batch_size;
std::vector<PaddleTensor> native_outputs, analysis_outputs;
native_pred->Run(inputs[0], &native_outputs, batch_size);
analysis_pred->Run(inputs[0], &analysis_outputs, batch_size);
CompareResult(analysis_outputs, native_outputs);
}
template <typename T> template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss; std::stringstream ss;
......
...@@ -42,9 +42,9 @@ void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, ...@@ -42,9 +42,9 @@ void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
} }
template <> template <>
void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config, void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
std::string model_dir, bool use_gpu, bool use_gpu, bool use_tensorrt,
bool use_tensorrt, int batch_size) { int batch_size) {
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config->SetModel(model_dir + "/" + FLAGS_prog_filename, config->SetModel(model_dir + "/" + FLAGS_prog_filename,
model_dir + "/" + FLAGS_param_filename); model_dir + "/" + FLAGS_param_filename);
...@@ -75,11 +75,11 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { ...@@ -75,11 +75,11 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
if (use_analysis || use_tensorrt) { if (use_analysis || use_tensorrt) {
contrib::AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.pass_builder()->TurnOnDebug(); config.pass_builder()->TurnOnDebug();
SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt, SetConfig<AnalysisConfig>(&config, model_dir, true, use_tensorrt,
FLAGS_batch_size); FLAGS_batch_size);
TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config), TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
inputs_all, &outputs, FLAGS_num_threads, true); inputs_all, &outputs, FLAGS_num_threads, true);
} else { } else {
...@@ -99,14 +99,36 @@ void compare(std::string model_dir, bool use_tensorrt) { ...@@ -99,14 +99,36 @@ void compare(std::string model_dir, bool use_tensorrt) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
} }
contrib::AnalysisConfig analysis_config; AnalysisConfig analysis_config;
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true, SetConfig<AnalysisConfig>(&analysis_config, model_dir, true, use_tensorrt,
use_tensorrt, FLAGS_batch_size); FLAGS_batch_size);
CompareNativeAndAnalysis( CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config), reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
inputs_all); inputs_all);
} }
void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
AnalysisConfig analysis_config;
SetConfig<AnalysisConfig>(&analysis_config, model_dir, true, use_tensorrt,
FLAGS_batch_size);
auto config =
reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config);
auto native_pred = CreateTestPredictor(config, false);
auto analysis_pred = CreateTestPredictor(config, true);
for (int i = 0; i < 100; i++) {
std::vector<std::vector<PaddleTensor>> inputs_all;
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
FLAGS_param_filename, nullptr, i);
} else {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "", nullptr,
i);
}
CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(),
inputs_all);
}
}
TEST(TensorRT_mobilenet, compare) { TEST(TensorRT_mobilenet, compare) {
std::string model_dir = FLAGS_infer_model + "/mobilenet"; std::string model_dir = FLAGS_infer_model + "/mobilenet";
compare(model_dir, /* use_tensorrt */ true); compare(model_dir, /* use_tensorrt */ true);
...@@ -162,5 +184,15 @@ TEST(TensorRT_mobilenet, profile) { ...@@ -162,5 +184,15 @@ TEST(TensorRT_mobilenet, profile) {
profile(model_dir, true, false); profile(model_dir, true, false);
} }
TEST(resnet50, compare_continuous_input) {
std::string model_dir = FLAGS_infer_model + "/resnet50";
compare_continuous_input(model_dir, true);
}
TEST(resnet50, compare_continuous_input_native) {
std::string model_dir = FLAGS_infer_model + "/resnet50";
compare_continuous_input(model_dir, false);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -13,8 +13,15 @@ ...@@ -13,8 +13,15 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_JEMALLOC
#include <jemalloc/jemalloc.h>
#endif
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
...@@ -37,7 +44,7 @@ template <typename Place> ...@@ -37,7 +44,7 @@ template <typename Place>
void *Alloc(const Place &place, size_t size); void *Alloc(const Place &place, size_t size);
template <typename Place> template <typename Place>
void Free(const Place &place, void *p); void Free(const Place &place, void *p, size_t size);
template <typename Place> template <typename Place>
size_t Used(const Place &place); size_t Used(const Place &place);
...@@ -52,6 +59,11 @@ size_t memory_usage(const platform::Place &p); ...@@ -52,6 +59,11 @@ size_t memory_usage(const platform::Place &p);
using BuddyAllocator = detail::BuddyAllocator; using BuddyAllocator = detail::BuddyAllocator;
std::unordered_map</*device id*/ int,
std::pair</*current memory usage*/ uint64_t,
/*peak memory usage*/ uint64_t>>
gpu_mem_info;
BuddyAllocator *GetCPUBuddyAllocator() { BuddyAllocator *GetCPUBuddyAllocator() {
// We tried thread_local for inference::RNN1 model, but that not works much // We tried thread_local for inference::RNN1 model, but that not works much
// for multi-thread test. // for multi-thread test.
...@@ -89,7 +101,11 @@ struct NaiveAllocator { ...@@ -89,7 +101,11 @@ struct NaiveAllocator {
template <> template <>
void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
#ifdef PADDLE_WITH_JEMALLOC
void *p = malloc(size);
#else
void *p = GetCPUBuddyAllocator()->Alloc(size); void *p = GetCPUBuddyAllocator()->Alloc(size);
#endif
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size); memset(p, 0xEF, size);
} }
...@@ -98,14 +114,24 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { ...@@ -98,14 +114,24 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) { void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
#ifdef PADDLE_WITH_JEMALLOC
free(p);
#else
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
#endif
} }
template <> template <>
size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
#ifdef PADDLE_WITH_JEMALLOC
// fake the result of used memory when PADDLE_WITH_JEMALLOC is ON
return 0U;
#else
return GetCPUBuddyAllocator()->Used(); return GetCPUBuddyAllocator()->Used();
#endif
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -177,9 +203,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -177,9 +203,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
LOG(WARNING) << "GPU memory used: " LOG(WARNING) << "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place)); << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} } else {
if (FLAGS_init_allocated_mem) { gpu_mem_info[place.device].first += size;
cudaMemset(ptr, 0xEF, size); if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
VLOG(3) << "device: " << place.device << " peak memory usage : "
<< (gpu_mem_info[place.device].second >> 20) << " MiB";
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
} }
return ptr; return ptr;
#else #else
...@@ -188,9 +221,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -188,9 +221,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
} }
template <> template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) { void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
gpu_mem_info[place.device].first -= size;
#else #else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
...@@ -243,7 +278,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, ...@@ -243,7 +278,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <> template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p) { void *p, size_t size) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
GetCUDAPinnedBuddyAllocator()->Free(p); GetCUDAPinnedBuddyAllocator()->Free(p);
#else #else
...@@ -264,15 +299,17 @@ struct AllocVisitor : public boost::static_visitor<void *> { ...@@ -264,15 +299,17 @@ struct AllocVisitor : public boost::static_visitor<void *> {
}; };
struct FreeVisitor : public boost::static_visitor<void> { struct FreeVisitor : public boost::static_visitor<void> {
inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} inline explicit FreeVisitor(void *ptr, size_t size)
: ptr_(ptr), size_(size) {}
template <typename Place> template <typename Place>
inline void operator()(const Place &place) const { inline void operator()(const Place &place) const {
Free<Place>(place, ptr_); Free<Place>(place, ptr_, size_);
} }
private: private:
void *ptr_; void *ptr_;
size_t size_;
}; };
size_t Usage::operator()(const platform::CPUPlace &cpu) const { size_t Usage::operator()(const platform::CPUPlace &cpu) const {
...@@ -304,8 +341,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { ...@@ -304,8 +341,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
} }
void LegacyAllocator::Free(Allocation *allocation) { void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), boost::apply_visitor(
allocation->place()); legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
delete allocation; delete allocation;
} }
} // namespace allocation } // namespace allocation
......
...@@ -13,6 +13,7 @@ add_subdirectory(detection) ...@@ -13,6 +13,7 @@ add_subdirectory(detection)
add_subdirectory(elementwise) add_subdirectory(elementwise)
add_subdirectory(fused) add_subdirectory(fused)
add_subdirectory(metrics) add_subdirectory(metrics)
add_subdirectory(ngraph)
add_subdirectory(optimizers) add_subdirectory(optimizers)
add_subdirectory(reduce_ops) add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops) add_subdirectory(sequence_ops)
...@@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) ...@@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
if (WITH_GPU) if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif() endif()
...@@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") ...@@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
......
...@@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ...@@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(
T* dbias) { T* dbias) {
const int outer_size = C; const int outer_size = C;
const int inner_size = N * HxW; const int inner_size = N * HxW;
typedef cub::BlockReduce<T, BlockDim> BlockReduce; typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage; __shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage; __shared__ typename BlockReduce::TempStorage db_storage;
...@@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ...@@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(
ds_sum += dy[index] * x[index]; ds_sum += dy[index] * x[index];
db_sum += dy[index]; db_sum += dy[index];
} }
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); __syncthreads();
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); auto ds_out =
BlockReduce(ds_storage).Reduce(static_cast<double>(ds_sum), cub::Sum());
auto db_out =
BlockReduce(db_storage).Reduce(static_cast<double>(db_sum), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dscale[i] = ds_sum; dscale[i] = ds_out;
dbias[i] = db_sum; dbias[i] = db_out;
} }
__syncthreads();
} }
} }
......
...@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include "paddle/fluid/operators/beam_search_op.h"
#include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndBeams(pre_ids, &selected_items);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
std::map<size_t /*offset*/, std::vector<Item>> hash;
framework::LoD new_lod;
auto *ids_data = selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
ids_data[low_offset] = item.id;
scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int64_t>();
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id_) ||
pre_ids_data[offset] != end_id_) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) { // all branchs of the beam (source sentence) end and
// prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
items->at(offset).clear();
}
}
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores) {
std::vector<std::vector<Item>> result;
std::vector<Item> items;
// for each source sentence, select the top beam_size items across all
// candidate sets.
while (NextItemSet(pre_ids, pre_scores, &items)) {
std::nth_element(
std::begin(items), std::begin(items) + beam_size_, std::end(items),
[](const Item &a, const Item &b) { return a.score > b.score; });
// prune the top beam_size items.
if (items.size() > beam_size_) {
items.resize(beam_size_);
}
result.emplace_back(items);
}
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result;
}
// the candidates of a source
bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
std::vector<BeamSearch::Item> *items) {
if (sent_offset_ >= ids_->NumElements(lod_level_)) {
return false;
}
// find the current candidates
auto ids = *ids_;
auto scores = *scores_;
auto abs_lod = framework::ToAbsOffset(ids.lod());
auto *ids_data = ids.data<int64_t>();
auto *scores_data = scores.data<float>();
size_t instance_dim = 1;
for (int i = 1; i < ids.dims().size(); i++) {
instance_dim *= ids.dims()[i];
}
auto *pre_ids_data = pre_ids.data<int64_t>();
auto *pre_scores_data = pre_scores.data<float>();
items->clear();
items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id_) {
// Allocate all probability mass to eos_id for finished branchs and the
// other candidate ids can be ignored.
items->emplace_back(offset, end_id_, pre_score);
} else {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
}
}
}
sent_offset_++;
return true;
}
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -219,29 +29,40 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -219,29 +29,40 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor) The LoDTensor containing the selected ids at the " "(LoDTensor) The LoDTensor containing the selected ids at the "
"previous step. It should be a tensor with shape (batch_size, 1) " "previous step. It should be a tensor with shape (batch_size, 1) "
"and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
"thefirst step."); "the first step.");
AddInput("pre_scores", AddInput("pre_scores",
"(LoDTensor) The LoDTensor containing the accumulated " "(LoDTensor) The LoDTensor containing the accumulated "
"scores corresponding to the selected ids at the previous step."); "scores corresponding to the selected ids at the previous step.");
AddInput("ids", AddInput("ids",
"(LoDTensor) The LoDTensor containing the candidates ids. Its " "(LoDTensor) The LoDTensor containing the candidates ids. Its "
"shape should be (batch_size * beam_size, K), where K supposed to " "shape should be (batch_size * beam_size, W). If not set, it will "
"be beam_size."); "be calculated out according to Input(scores) in this operator.")
.AsDispensable();
AddInput("scores", AddInput("scores",
"(LoDTensor) The LodTensor containing the accumulated scores " "(LoDTensor) The LoDTensor containing the current scores "
"corresponding to Input(ids) and its shape is the same as the " "corresponding to Input(ids). If Input(ids) is not nullptr, its "
"shape of Input(ids)."); "shape is the same as that of Input(ids)."
"If is_accumulated is true, Input(scores) is accumulated scores "
"and will be used derectedly. Else, each score will be "
"transformed to the log field and accumulate Input(pre_sores) "
"first.");
AddOutput("selected_ids", AddOutput("selected_ids",
"A LodTensor that stores the IDs selected by beam search."); "A LodTensor that stores the IDs selected by beam search.");
AddOutput("selected_scores", AddOutput("selected_scores",
"A LoDTensor containing the accumulated scores corresponding to " "A LoDTensor containing the accumulated scores corresponding to "
"Output(selected_ids)."); "Output(selected_ids).");
AddOutput(
"parent_idx",
"A Tensor preserving the selected_ids' parent indice in pre_ids.");
// Attributes stored in AttributeMap // Attributes stored in AttributeMap
AddAttr<int>("level", "the level of LoDTensor"); AddAttr<int>("level", "the level of LoDTensor");
AddAttr<int>("beam_size", "beam size for beam search"); AddAttr<int>("beam_size", "beam size for beam search");
AddAttr<int>("end_id", AddAttr<int>("end_id",
"the token id which indicates the end of a sequence"); "the token id which indicates the end of a sequence");
AddAttr<bool>("is_accumulated",
"Whether the Input(scores) is accumulated scores.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
This operator does the search in beams for one time step. This operator does the search in beams for one time step.
...@@ -265,10 +86,9 @@ class BeamSearchOp : public framework::OperatorWithKernel { ...@@ -265,10 +86,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
for (const std::string &arg : for (const std::string &arg :
std::vector<std::string>({"pre_ids", "ids", "scores"})) { std::vector<std::string>({"pre_ids", "scores"})) {
PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'", PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
arg); arg);
} }
...@@ -279,12 +99,22 @@ class BeamSearchOp : public framework::OperatorWithKernel { ...@@ -279,12 +99,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
} }
} }
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( auto *scores = ctx.Input<framework::LoDTensor>("scores");
ctx.Input<framework::LoDTensor>("pre_ids")->type(), size_t level = ctx.Attr<int>("level");
platform::CPUPlace()); size_t batch_size = scores->lod()[level].size() - 1;
return kt; // The current CUDA kernel only supports cases with batch_size < 4.
// Compute on CPU for cases with batch_size > 4.
if (batch_size <= 4) {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
} else {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("pre_ids")->type(),
platform::CPUPlace());
}
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -14,187 +14,12 @@ limitations under the License. */ ...@@ -14,187 +14,12 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/beam_search.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
/*
* This is an implementation of beam search.
*
* To explain the details, lets take machine translation task for example, in
* this task, one source sentence is translated to multiple target sentences,
* during this period, one sentence will be translated to multiple translation
* prefixes(target sentence that have not ended), in each time step a prefix
* will have some candidates, input the candidate ids and their corresponding
* scores (probabilities), it will sort and select the top beam_size candidates
* for each source sentence, and store the selected candidates's score and their
* corresponding ids to LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* LoD (should have 2 levels)
* first level: [0, 1, 4]
* second level: [0, 1, 2, 3, 4]
*
* tensor's data
* [
* [4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]
* ]
*
* scores:
* LoD same as `ids`
* tensor's data
* [
* [0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]
* ]
*
* the inputs means that there are 2 source sentences to translate, and the
* first source has 1 prefix, the second source has 2 prefix.
*
* lets assume beam size is 2, and the beam search's output should be
* LoD
* first level:
* [0, 1, 2]
* second level:
* [0, 2, 4]
*
* id tensor's data
* [[
* 4,
* 1,
* 3,
* 8,
* ]]
*
* score tensor's data
* [[
* 0.5,
* 0.3,
* 0.9,
* 0.7
* ]]
*
 * TODO: all the prune operations should happen inside beam search. It would be
 * better to split the beam search algorithm into a sequence of smaller
 * operators, so that the prune operators can be inserted into this sequence.
 */
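The per-prefix selection described above can be illustrated with a small standalone sketch (a hypothetical helper, not part of this header): for each prefix, keep only the beam_size highest-scoring (score, id) pairs, then regroup the results per source sentence to obtain the selected_ids/selected_scores layout shown in the example.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Keep the `beam_size` highest-scoring (score, id) pairs of one prefix.
std::vector<std::pair<float, int64_t>> TopKForPrefix(
    std::vector<std::pair<float, int64_t>> candidates, size_t beam_size) {
  size_t k = std::min(beam_size, candidates.size());
  std::partial_sort(
      candidates.begin(), candidates.begin() + k, candidates.end(),
      [](const std::pair<float, int64_t>& a,
         const std::pair<float, int64_t>& b) { return a.first > b.first; });
  candidates.resize(k);
  return candidates;
}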
class BeamSearch {
public:
// TODO(superjom) make type customizable
using id_t = size_t;
using score_t = float;
/*
   * Input the arguments needed by this class.
*/
BeamSearch(const framework::LoDTensor& ids,
const framework::LoDTensor& scores, size_t level, size_t beam_size,
int end_id)
: beam_size_(beam_size),
ids_(&ids),
scores_(&scores),
lod_level_(level),
end_id_(end_id) {}
/*
* The main function of beam search.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might be the candidate term id sets,
   * each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1]
* - [0 1 2]]
* - [[]
* - [0 1]]
   * The first level of LoD indicates that there are two source sentences. The
   * second level describes the offsets of the candidate id sets within the
   * source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
   * Returns false if all the input tensors are empty; in the machine
   * translation task this means no candidates are provided and the task will
   * stop running.
*/
void operator()(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores);
/*
   * The basic items used for sorting.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// // prefix id in the lower lod level.
// size_t prefix;
// the candidate id
id_t id;
// the corresponding score
score_t score;
};
protected:
/*
   * Prune the source sentences whose branches are all finished; this step is
   * optional. Pruning must happen one step later than finishing (hence pre_ids
   * is needed here), since the end tokens must be written out.
*/
void PruneEndBeams(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
/*
   * Transform the items into a map whose key is the offset and whose value is
   * the items.
   * NOTE: low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs, size_t element_num);
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores);
/*
   * Get the items of the next source sequence; return false if there are no remaining items.
*/
bool NextItemSet(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
std::vector<Item>* items);
private:
size_t beam_size_;
const framework::LoDTensor* ids_;
const framework::LoDTensor* scores_;
size_t lod_level_{0};
size_t sent_offset_{0};
int end_id_{0};
};
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BeamSearchOpKernel : public framework::OpKernel<T> { class BeamSearchOpKernel : public framework::OpKernel<T> {
public: public:
...@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> { ...@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
auto* scores = context.Input<framework::LoDTensor>("scores"); auto* scores = context.Input<framework::LoDTensor>("scores");
auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids"); auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores"); auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
PADDLE_ENFORCE_NOT_NULL(ids);
PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(scores);
PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_ids);
PADDLE_ENFORCE_NOT_NULL(pre_scores); PADDLE_ENFORCE_NOT_NULL(pre_scores);
...@@ -211,14 +36,22 @@ class BeamSearchOpKernel : public framework::OpKernel<T> { ...@@ -211,14 +36,22 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
size_t level = context.Attr<int>("level"); size_t level = context.Attr<int>("level");
size_t beam_size = context.Attr<int>("beam_size"); size_t beam_size = context.Attr<int>("beam_size");
int end_id = context.Attr<int>("end_id"); int end_id = context.Attr<int>("end_id");
BeamSearch alg(*ids, *scores, level, beam_size, end_id); bool is_accumulated = context.Attr<bool>("is_accumulated");
auto selected_ids = context.Output<framework::LoDTensor>("selected_ids"); auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
auto selected_scores = auto selected_scores =
context.Output<framework::LoDTensor>("selected_scores"); context.Output<framework::LoDTensor>("selected_scores");
auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_ids);
PADDLE_ENFORCE_NOT_NULL(selected_scores); PADDLE_ENFORCE_NOT_NULL(selected_scores);
alg(*pre_ids, *pre_scores, selected_ids, selected_scores); PADDLE_ENFORCE_NOT_NULL(parent_idx);
math::BeamSearchFunctor<DeviceContext, T> alg;
alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
ids, scores, selected_ids, selected_scores, parent_idx, level,
beam_size, end_id, is_accumulated);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <gtest/gtest.h>
#include <vector>
namespace paddle {
namespace test {
using std::vector;
using framework::LoDTensor;
using framework::LoD;
using operators::BeamSearch;
using paddle::platform::CPUPlace;
using std::cout;
using std::endl;
void CreateInput(LoDTensor* ids, LoDTensor* scores) {
LoD lod;
vector<size_t> level0({0, 2, 4});
vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
scores->set_lod(lod);
auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
ids->Resize(dims);
scores->Resize(dims);
CPUPlace place;
auto* ids_data = ids->mutable_data<int64_t>(place);
auto* scores_data = scores->mutable_data<float>(place);
vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
vector<float> _scores(
{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
for (int i = 0; i < 12; i++) {
ids_data[i] = _ids[i];
scores_data[i] = _scores[i];
}
}
// It seems that beam_search_op has bugs.
TEST(DISABLED_beam_search_op, run) {
CPUPlace place;
LoDTensor ids, scores;
CreateInput(&ids, &scores);
LoDTensor pre_ids;
pre_ids.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
}
LoDTensor pre_scores;
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
}
BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
LoDTensor sids, sscores;
beamsearch(pre_ids, pre_scores, &sids, &sscores);
LOG(INFO) << "score: " << sscores << endl;
ASSERT_EQ(sids.lod(), sscores.lod());
vector<int> tids({4, 2, 3, 8});
vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
}
}
} // namespace test
} // namespace paddle
...@@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel<T> { ...@@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel<T> {
auto* label = ctx.Input<Tensor>("Label"); auto* label = ctx.Input<Tensor>("Label");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
const int step_size = x->dims()[0]; const size_t step_size = static_cast<size_t>(x->dims()[0]);
const int num_classes = x->dims()[1]; const size_t num_classes = static_cast<size_t>(x->dims()[1]);
T* dx_data = dx->mutable_data<T>(ctx.GetPlace()); T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* dy_data = dy->data<T>(); const T* dy_data = dy->data<T>();
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
......
...@@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo; cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
Tensor cudnn_workspace;
void* cudnn_workspace_ptr = nullptr;
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH)); cudnn_conv_desc, CUDNN_DEFAULT_MATH));
...@@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit, &algo)); workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo; VLOG(3) << "cuDNN forward algo " << algo;
} else { } else {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_limit)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
auto search_func = [&]() { auto search_func = [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( CUDNN_ENFORCE(
handle, cudnn_input_desc, input_data, cudnn_filter_desc, platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, handle, cudnn_input_desc, input_data, cudnn_filter_desc,
kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
cudnn_workspace_ptr, workspace_size_limit)); kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)"; VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) { for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i]; const auto& stat = fwd_perf_stat[i];
...@@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
if (!cudnn_workspace_ptr) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_in_bytes)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
if ((activation == "identity") && (!residual)) { if ((activation == "identity") && (!residual)) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
...@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// cudnnConvolutionForward and cudnnAddTensor // cudnnConvolutionForward and cudnnAddTensor
// ------------- cudnn conv forward and bias add --------------------- // ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data)); output_data));
...@@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv+bias+act forward -------------------- // ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f; ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data)); output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels"); std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
if (channels.size()) { if (channels.size()) {
......
...@@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
int output_offset = output->numel() / output->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups;
int filter_offset = filter->numel() / groups; int filter_offset = filter->numel() / groups;
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto temp_allocation =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
workspace_size_in_bytes);
void* cudnn_workspace = temp_allocation->ptr();
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
cudnn_output_desc, output_data + output_offset * g)); algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
}; };
...@@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
output_grad->numel() / output_grad->dims()[0] / groups; output_grad->numel() / output_grad->dims()[0] / groups;
int filter_offset = filter->numel() / groups; int filter_offset = filter->numel() / groups;
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto temp_allocation =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
workspace_size_in_bytes);
void* cudnn_workspace = temp_allocation->ptr();
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_output_desc, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
output_grad_data + output_grad_offset * g, cudnn_filter_desc, handle, &alpha, cudnn_output_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo, output_grad_data + output_grad_offset * g, cudnn_filter_desc,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
input_grad_data + input_offset * g)); cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
...@@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter // Gradient with respect to the filter
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_output_desc, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
output_grad_data + output_grad_offset * g, cudnn_input_desc, handle, &alpha, cudnn_output_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo, output_grad_data + output_grad_offset * g, cudnn_input_desc,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, input_data + input_offset * g, cudnn_conv_desc, filter_algo,
filter_grad_data + filter_offset * g)); cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + filter_offset * g));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
} }
} }
......
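Both cuDNN hunks above route each cuDNN call through the device context's cudnn_workspace_handle() instead of holding a manually allocated temporary workspace tensor, passing the call as a lambda to RunFunc so the workspace only lives for the duration of that call. A rough, self-contained sketch of the pattern (the handle below is a hypothetical stand-in, not Paddle's actual class):

#include <cstdlib>
#include <functional>

// Hypothetical stand-in that mimics the RunFunc contract used above: the
// callable receives a workspace pointer valid only for the duration of the call.
struct FakeWorkspaceHandle {
  void RunFunc(const std::function<void(void*)>& fn, size_t num_bytes) {
    void* workspace = std::malloc(num_bytes);  // the real handle reuses a cached buffer
    fn(workspace);
    std::free(workspace);
  }
};

void LaunchWithWorkspace(FakeWorkspaceHandle* handle,
                         size_t workspace_size_in_bytes) {
  auto cudnn_func = [&](void* cudnn_workspace) {
    // Issue exactly one cuDNN call here, giving it `cudnn_workspace` of
    // `workspace_size_in_bytes` bytes.
    (void)cudnn_workspace;
  };
  handle->RunFunc(cudnn_func, workspace_size_in_bytes);
}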
...@@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc ...@@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu) polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
if(WITH_GPU) if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
...@@ -45,3 +46,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op ...@@ -45,3 +46,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op
foreach(src ${LOCAL_DETECTION_LIBS}) foreach(src ${LOCAL_DETECTION_LIBS})
set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
endforeach() endforeach()
cc_library(mask_util SRCS mask_util.cc DEPS memory)
cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util)
detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
...@@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, ...@@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
inter_w = std::max(x_max - x_min + 1, zero); inter_w = std::max(x_max - x_min + 1, zero);
inter_h = std::max(y_max - y_min + 1, zero); inter_h = std::max(y_max - y_min + 1, zero);
inter_area = inter_w * inter_h; inter_area = inter_w * inter_h;
overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); overlaps_et(i, j) =
(inter_area == 0.) ? 0 : inter_area /
(r_box_area + c_box_area - inter_area);
} }
} }
} }
......
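The change above short-circuits the overlap to 0 when the intersection area is 0, rather than relying on the division, which also guards the degenerate case. A minimal sketch of the same IoU computation for a single box pair (hypothetical helper; boxes are [xmin, ymin, xmax, ymax] using the same +1 pixel convention):

#include <algorithm>

float BoxIoU(const float a[4], const float b[4]) {
  const float zero = 0.f;
  const float a_area = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  const float b_area = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  const float inter_w =
      std::max(std::min(a[2], b[2]) - std::max(a[0], b[0]) + 1, zero);
  const float inter_h =
      std::max(std::min(a[3], b[3]) - std::max(a[1], b[1]) + 1, zero);
  const float inter_area = inter_w * inter_h;
  // Mirror the fix above: return 0 directly when the intersection is empty.
  return inter_area == 0.f ? 0.f : inter_area / (a_area + b_area - inter_area);
}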
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/operators/detection/box_coder_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -32,32 +33,57 @@ class BoxCoderOp : public framework::OperatorWithKernel { ...@@ -32,32 +33,57 @@ class BoxCoderOp : public framework::OperatorWithKernel {
if (ctx->IsRuntime()) { if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBoxVar must be 2"); "The rank of Input PriorBox must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, PADDLE_ENFORCE_EQ(prior_box_dims[1], 4,
"The shape of PriorBox is [N, 4]"); "The shape of PriorBox is [N, 4]");
if (ctx->HasInput("PriorBoxVar")) { if (ctx->HasInput("PriorBoxVar")) {
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); PADDLE_ENFORCE(
prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
"Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
if (prior_box_var_dims.size() == 1) {
PADDLE_ENFORCE_EQ(
prior_box_var_dims[0], 4,
"The 1st dimension of Input(PriorBoxVar) should be 4"
"when the rank is 1.");
} else {
PADDLE_ENFORCE_EQ(
prior_box_dims, prior_box_var_dims,
"The dimension of Input(PriorBoxVar) should be equal to"
"the dimension of Input(PriorBox when the rank is 2.)");
}
} }
}
auto code_type = auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type")); int axis = ctx->Attrs().Get<int>("axis");
if (code_type == BoxCodeType::kEncodeCenterSize) { if (code_type == BoxCodeType::kEncodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2"); "The rank of Input TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4, PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]"); "The shape of TargetBox is [M, 4]");
} else if (code_type == BoxCodeType::kDecodeCenterSize) { ctx->SetOutputDim(
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, "OutputBox",
"The rank of Input of TargetBox must be 3"); framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input TargetBox must be 3");
if (axis == 0) {
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); } else if (axis == 1) {
PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
} else {
PADDLE_THROW("axis must be 0 or 1.");
} }
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
}
if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) {
ctx->ShareLoD("PriorBox", /*->*/ "OutputBox");
} else {
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
} }
ctx->SetOutputDim(
"OutputBox",
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
} }
}; };
...@@ -100,6 +126,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -100,6 +126,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default true) " "(bool, default true) "
"whether treat the priorbox as a noramlized box") "whether treat the priorbox as a noramlized box")
.SetDefault(true); .SetDefault(true);
AddAttr<int>("axis",
"(int, default 0)"
"which axis in PriorBox to broadcast for box decode,"
"for example, if axis is 0 and TargetBox has shape"
"[N, M, 4] and PriorBox has shape [M, 4], then PriorBox "
"will broadcast to [N, M, 4] for decoding. It is only valid"
"when code type is decode_center_size")
.SetDefault(0)
.InEnum({0, 1});
AddAttr<std::vector<float>>(
"variance",
"(vector<float>, default {}),"
"variance of prior box with shape [4]. PriorBoxVar and variance can"
"not be provided at the same time.")
.SetDefault(std::vector<float>{});
AddOutput("OutputBox", AddOutput("OutputBox",
"(LoDTensor or Tensor) " "(LoDTensor or Tensor) "
"When code_type is 'encode_center_size', the output tensor of " "When code_type is 'encode_center_size', the output tensor of "
...@@ -138,7 +179,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width ...@@ -138,7 +179,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
encoded/decoded coordinates, width and height. encoded/decoded coordinates, width and height.
During box decoding, two broadcast modes are supported. Suppose the target box
has shape [N, M, 4]; the prior box may then have shape [N, 4] or [M, 4], and it
will be broadcast to the target box along the assigned axis.
)DOC"); )DOC");
} }
}; };
......
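The axis attribute and the broadcast note above only affect shape checking during decode. A small sketch of the shape rule (hypothetical helper, mirroring the checks added to InferShape above):

#include <array>
#include <stdexcept>

// For code_type == decode_center_size: TargetBox is [N, M, 4] and PriorBox is
// [K, 4]; axis 0 requires K == M, axis 1 requires K == N. The output keeps the
// TargetBox shape [N, M, 4].
std::array<long, 3> DecodeOutputShape(long N, long M, long K, int axis) {
  if (axis == 0) {
    if (K != M) throw std::invalid_argument("axis 0 expects PriorBox of shape [M, 4]");
  } else if (axis == 1) {
    if (K != N) throw std::invalid_argument("axis 1 expects PriorBox of shape [N, 4]");
  } else {
    throw std::invalid_argument("axis must be 0 or 1");
  }
  return {N, M, 4};
}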
(This file's diff is collapsed.)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <vector>
namespace paddle {
namespace operators {
void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask);
void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
float* boxes);
void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
const float* box, int M, uint8_t* mask);
} // namespace operators
} // namespace paddle
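A hypothetical usage sketch for Poly2Boxes as declared above (the include path assumes the header lives under operators/detection, as the CMake hunk earlier suggests; the coordinates are made up, and the output buffer needs 4 floats per instance):

#include <vector>
#include "paddle/fluid/operators/detection/mask_util.h"

void Poly2BoxesExample() {
  // One instance described by a single ring: (0,0) -> (4,0) -> (4,4) -> (0,4).
  std::vector<std::vector<std::vector<float>>> polys = {
      {{0.f, 0.f, 4.f, 0.f, 4.f, 4.f, 0.f, 4.f}}};
  std::vector<float> boxes(polys.size() * 4);
  paddle::operators::Poly2Boxes(polys, boxes.data());
  // boxes now holds one [xmin, ymin, xmax, ymax] record for the instance.
}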
(The remaining file diffs in this commit are collapsed in this view.)