Commit c7e38680 authored by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-communicator

@@ -276,9 +276,3 @@ add_subdirectory(paddle)
if(WITH_PYTHON)
  add_subdirectory(python)
endif()
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-  add_subdirectory(doc)
-endif()
@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG WITH_GPU
ARG WITH_AVX
-ARG WITH_DOC
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-OFF}
ENV HOME /root
# Add bash enhancements
...
# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This modules defines
# SPHINX_EXECUTABLE
# SPHINX_FOUND
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
if( NOT SPHINX_EXECUTABLE )
set(_Python_VERSIONS
2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5
)
foreach( _version ${_Python_VERSIONS} )
set( _sphinx_NAMES sphinx-build-${_version} )
find_program( SPHINX_EXECUTABLE
NAMES ${_sphinx_NAMES}
PATHS
/usr/bin
/usr/local/bin
    /opt/local/bin
DOC "Sphinx documentation generator"
)
endforeach()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Sphinx DEFAULT_MSG
SPHINX_EXECUTABLE
)
option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON )
option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF )
option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF )
option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF )
option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF )
option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF )
option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF )
option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF )
option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF )
mark_as_advanced(
SPHINX_EXECUTABLE
SPHINX_HTML_OUTPUT
SPHINX_DIRHTML_OUTPUT
SPHINX_HTMLHELP_OUTPUT
SPHINX_QTHELP_OUTPUT
SPHINX_DEVHELP_OUTPUT
SPHINX_EPUB_OUTPUT
SPHINX_LATEX_OUTPUT
SPHINX_MAN_OUTPUT
SPHINX_TEXT_OUTPUT
)
function( Sphinx_add_target target_name builder conf cache source destination )
add_custom_target( ${target_name} ALL
COMMAND ${SPHINX_EXECUTABLE} -b ${builder}
-d ${cache}
-c ${conf}
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND cd ${destination} && ln -sf ./index_*.html index.html
)
set_property(
DIRECTORY APPEND PROPERTY
ADDITIONAL_MAKE_CLEAN_FILES
${destination}
)
endfunction()
# Target dependencies can be optionally listed at the end.
function( Sphinx_add_targets target_base_name conf source base_destination )
set( _dependencies )
foreach( arg IN LISTS ARGN )
set( _dependencies ${_dependencies} ${arg} )
endforeach()
if( ${SPHINX_HTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html )
add_dependencies( ${target_base_name}_html ${_dependencies} )
endif()
if( ${SPHINX_DIRHTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml )
add_dependencies( ${target_base_name}_dirhtml ${_dependencies} )
endif()
if( ${SPHINX_QTHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp )
add_dependencies( ${target_base_name}_qthelp ${_dependencies} )
endif()
if( ${SPHINX_DEVHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp )
add_dependencies( ${target_base_name}_devhelp ${_dependencies} )
endif()
if( ${SPHINX_EPUB_OUTPUT} )
Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub )
add_dependencies( ${target_base_name}_epub ${_dependencies} )
endif()
if( ${SPHINX_LATEX_OUTPUT} )
Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex )
add_dependencies( ${target_base_name}_latex ${_dependencies} )
endif()
if( ${SPHINX_MAN_OUTPUT} )
Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man )
add_dependencies( ${target_base_name}_man ${_dependencies} )
endif()
if( ${SPHINX_TEXT_OUTPUT} )
Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text )
add_dependencies( ${target_base_name}_text ${_dependencies} )
endif()
if( ${BUILD_TESTING} )
sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck )
add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
endif()
endfunction()
@@ -388,6 +388,7 @@ function(cc_test TARGET_NAME)
    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+   set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
    # No unit test should exceed 10 minutes.
    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
@@ -460,6 +461,7 @@ function(nv_test TARGET_NAME)
    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+   set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
  endif()
endfunction(nv_test)
@@ -708,9 +710,10 @@ function(py_test TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS ARGS ENVS)
  cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  add_test(NAME ${TARGET_NAME}
           COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-                  FLAGS_cpu_deterministic=true
+                  FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
                   PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
           ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...
@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
@@ -213,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
@@ -359,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
...
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-if(WITH_NGRAPH)
-  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler)
-endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
-  if(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
-  else(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(WITH_NGRAPH)
+  if (WITH_NGRAPH)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
+  else ()
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+  endif()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
@@ -214,3 +206,24 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
+# Get the current working branch
+execute_process(
+  COMMAND git rev-parse --abbrev-ref HEAD
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_BRANCH
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+# Get the latest abbreviated commit hash of the working branch
+execute_process(
+  COMMAND git log -1 --format=%h
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_COMMIT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+message(STATUS "commit: ${PADDLE_COMMIT}")
+message(STATUS "branch: ${PADDLE_BRANCH}")
+configure_file(commit.h.in commit.h)
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "@PADDLE_COMMIT@";
}
static std::string paddle_compile_branch() {
return "@PADDLE_BRANCH@";
}
static std::string paddle_version() {
return "@PADDLE_VERSION@";
}
} // namespace framework
} // namespace paddle
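For reference, `configure_file(commit.h.in commit.h)` fills in the `@PADDLE_COMMIT@`, `@PADDLE_BRANCH@`, and `@PADDLE_VERSION@` placeholders above with the values captured by the `execute_process` calls. A minimal standalone sketch of how the generated header behaves (the literal values below are placeholders, not real build output):

    #include <iostream>
    #include <string>

    namespace paddle {
    namespace framework {
    // Hand-expanded stand-in for the configured commit.h; a real build
    // substitutes these strings from @PADDLE_COMMIT@ and @PADDLE_BRANCH@.
    static std::string paddle_commit() { return "c7e3868"; }
    static std::string paddle_compile_branch() { return "develop"; }
    }  // namespace framework
    }  // namespace paddle

    int main() {
      std::cout << "built from " << paddle::framework::paddle_compile_branch()
                << " @ " << paddle::framework::paddle_commit() << std::endl;
      return 0;
    }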
@@ -91,7 +91,7 @@ struct BuildStrategy {
  int num_trainers_{1};
  int trainer_id_{0};
  std::vector<std::string> trainers_endpoints_;
-  bool remove_unnecessary_lock_{false};
+  bool remove_unnecessary_lock_{true};
  // NOTE:
  // Before you add new options, think if it's a general strategy that works
...
@@ -25,6 +25,9 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
+  // If we set this to 1, all variables will be deleted when a batch
+  // finishes, which will cost 15%+ in performance.
+  // Please be aware of this parameter.
  size_t num_iteration_per_drop_scope_{1};
  ExecutorType type_{kDefault};
  bool dry_run_{false};
...
@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
-#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#endif
DECLARE_bool(benchmark);
@@ -133,24 +133,6 @@ static void DeleteUnusedTensors(
  }
}
-static void EnableFusedOp(ExecutorPrepareContext* ctx) {
-#ifdef PADDLE_WITH_NGRAPH
-  VLOG(3) << "use_ngraph=True";
-  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
-  for (auto& interval : intervals) {
-    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
-                                     interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
-  }
-  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-    ctx->ops_.erase(it->at(0) + 1, it->at(1));
-  }
-#else
-  LOG(WARNING)
-      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
-#endif
-}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
@@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
+#endif
  auto ctx = Prepare(pdesc, block_id);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
@@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
-  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
  return ctx;
}
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
+#include <set>
#include <vector>
namespace paddle {
@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
  }
  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+  std::set<Node *> to_visit{source.begin(), source.end()};
  std::vector<Node *> inlink_visited;
  while (!to_visit.empty()) {
...
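The `graph_traits.cc` change above swaps `std::unordered_set` for `std::set` in the topological-sort iterator, so nodes are visited in a deterministic, key-ordered sequence rather than one that depends on hash buckets. A standalone illustration of the difference (not Paddle code):

    #include <iostream>
    #include <set>
    #include <unordered_set>

    int main() {
      // unordered_set iteration order depends on hashing and bucket layout,
      // so it can differ across standard-library implementations.
      std::unordered_set<int> unordered{3, 1, 2};
      for (int v : unordered) std::cout << v << ' ';  // implementation-defined
      std::cout << '\n';

      // std::set always iterates in sorted key order: prints 1 2 3.
      std::set<int> ordered{3, 1, 2};
      for (int v : ordered) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }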
@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  if (!platform::is_cpu_place(t.place())) {
-    LoDTensor tt;
-    framework::TensorCopy(t, platform::CPUPlace(), &tt);
+    LoDTensor cpu_tensor;
+    cpu_tensor.set_lod(t.lod());
+    framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
-    os << tt;
+    os << cpu_tensor;
    return os;
  }
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
class NgraphOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
NgraphOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit NgraphOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
@@ -19,8 +19,6 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
-  int data_type = -1;
+  proto::VarType::Type default_data_type =
+      static_cast<proto::VarType::Type>(-1);
+  proto::VarType::Type data_type = default_data_type;
  for (auto& input : this->inputs_) {
    const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
    for (size_t i = 0; i < vars.size(); ++i) {
@@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
      if (t != nullptr) {
        PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
                       input.first, i);
-        int tmp = static_cast<int>(t->type());
+        proto::VarType::Type tmp = t->type();
        PADDLE_ENFORCE(
-            tmp == data_type || data_type == -1,
+            tmp == data_type || data_type == default_data_type,
            "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-            Type(), data_type, tmp);
+            Type(), DataTypeToString(data_type), DataTypeToString(tmp));
        data_type = tmp;
      }
    }
  }
-  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<proto::VarType::Type>(data_type);
+  PADDLE_ENFORCE(data_type != default_data_type,
+                 "DataType should be indicated by input");
+  return data_type;
}
OpKernelType OperatorWithKernel::GetExpectedKernelType(
...
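The `IndicateDataType` change above replaces a bare `int` sentinel with a typed one, but the shape of the algorithm is unchanged: start from an out-of-range sentinel, require every initialized input to agree on one type, and fail if nothing set a type. A standalone sketch of that pattern (illustrative enum, not Paddle's proto types):

    #include <cassert>
    #include <vector>

    enum class VarType { FP32 = 0, FP64 = 1, INT64 = 2 };

    VarType IndicateCommonType(const std::vector<VarType>& input_types) {
      // Typed sentinel meaning "no type seen yet", like default_data_type above.
      const VarType kUnset = static_cast<VarType>(-1);
      VarType data_type = kUnset;
      for (VarType t : input_types) {
        // All inputs must agree on a single data type.
        assert(data_type == kUnset || data_type == t);
        data_type = t;
      }
      assert(data_type != kUnset);  // at least one input must indicate the type
      return data_type;
    }

    int main() {
      assert(IndicateCommonType({VarType::FP32, VarType::FP32}) == VarType::FP32);
      return 0;
    }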
@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
  check_memory_size();
  bool valid =
      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d",
+                 DataTypeToString(type_));
  return reinterpret_cast<const T*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
...
if(WITH_PYTHON)
-  cc_library(layer SRCS layer.cc DEPS proto_desc operator)
-  cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+  cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
+  cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
  cc_library(engine SRCS engine.cc)
endif()
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <limits>
#include <map>
@@ -22,6 +23,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
@@ -34,22 +38,66 @@ std::map<int, py::object> py_funcs_;
using framework::Variable;
-void AddTo(Variable* src, Variable* dst) {
-  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
-  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+namespace detail {
+template <typename T>
+class TensorAddToFunctor : public boost::static_visitor<> {
+ public:
+  TensorAddToFunctor(int64_t numel, const T* x, T* y)
+      : numel_(numel), x_(x), y_(y) {}
+  void operator()(const platform::CPUPlace& place) {
+    platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
+    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+#ifdef PADDLE_WITH_CUDA
+  void operator()(const platform::CUDAPlace& place) {
+    platform::CUDADeviceContext* ctx =
+        dynamic_cast<platform::CUDADeviceContext*>(
+            platform::DeviceContextPool::Instance().Get(place));
+    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+#else
+  void operator()(const platform::CUDAPlace& place) {
+    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+  }
+#endif
+  // there is NO blas in CUDAPinnedPlace
+  void operator()(const platform::CUDAPinnedPlace& place) {
+    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+  }
+ private:
+  int64_t numel_;
+  const T* x_;
+  T* y_;
+};
+}  // namespace detail
+void AddTo(Variable* src, Variable* dst, platform::Place place) {
+  framework::Tensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  framework::Tensor* src_tensor = src->GetMutable<framework::LoDTensor>();
  // FIXME(minqiyang): loss_grad op will pass a zero grad of label
  // ugly fix for it
  if (src_tensor->numel() == 0) {
    return;
  }
  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
                 "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
                 src_tensor->numel());
-  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
-  const float* src_data = src_tensor->data<float>();
-  for (int64_t i = 0; i < src_tensor->numel(); ++i) {
-    dst_data[i] += src_data[i];
-  }
+  detail::TensorAddToFunctor<float> func(
+      src_tensor->numel(), src_tensor->data<float>(),
+      dst_tensor->mutable_data<float>(place));
+  boost::apply_visitor(func, place);
}
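The `TensorAddToFunctor` introduced above is a `boost::static_visitor`: `boost::apply_visitor` inspects the runtime `Place` variant and dispatches to the matching `operator()`, so one call site covers both the CPU and CUDA AXPY paths. A standalone sketch of that pattern (illustrative types; assumes Boost.Variant is available):

    #include <iostream>
    #include <boost/variant.hpp>

    // Stand-ins for platform::CPUPlace / platform::CUDAPlace.
    struct CPUPlace {};
    struct CUDAPlace { int device_id = 0; };
    using Place = boost::variant<CPUPlace, CUDAPlace>;

    // One overload per place type; apply_visitor picks the right one at runtime.
    struct AddVisitor : public boost::static_visitor<> {
      void operator()(const CPUPlace&) const { std::cout << "CPU AXPY path\n"; }
      void operator()(const CUDAPlace& p) const {
        std::cout << "CUDA AXPY path on device " << p.device_id << '\n';
      }
    };

    int main() {
      Place place = CUDAPlace{0};
      AddVisitor visitor;
      boost::apply_visitor(visitor, place);  // prints the CUDA branch
      return 0;
    }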
class Autograd {
@@ -120,66 +168,104 @@ class Autograd {
  }
};
+std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
+                                             const bool blocking) const {
+  PADDLE_ENFORCE(var_->IsInitialized(),
+                 "Variable must be initialized when getting numpy tensor");
+  std::unique_ptr<VarBase> new_var(new VarBase());
+  framework::LoDTensor* tensor =
+      new_var->var_->GetMutable<framework::LoDTensor>();
+  tensor->Resize(var_->Get<framework::LoDTensor>().dims());
+  tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
+  if (blocking) {
+    platform::DeviceContext* dev_ctx =
+        platform::DeviceContextPool::Instance().Get(dst_place);
+    framework::TensorCopySync(var_->Get<framework::LoDTensor>(), dst_place,
+                              tensor);
+    dev_ctx->Wait();
+  } else {
+    framework::TensorCopy(var_->Get<framework::LoDTensor>(), dst_place, tensor);
+  }
+  if (platform::is_gpu_place(dst_place)) {
+    VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
+  }
+  return new_var;
+}
framework::LoDTensor& VarBase::GradValue() {
  VLOG(3) << "get var grad " << var_desc_->Name();
  return *(grads_->var_->GetMutable<framework::LoDTensor>());
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (!grad_op_desc_ && backward_id_ <= 0) {
+  if (grad_op_descs_.empty() && backward_id_ <= 0) {
    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
    return {};
  }
-  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
+  std::vector<framework::VariableValueMap> grad_outputs;
  if (backward_id_ > 0) {
    VLOG(3) << "py_layer_grad";
-    grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
-        backward_id_,
-        grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
+    grad_outputs.resize(1);
+    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+        PyLayer::ApplyGrad(
+            backward_id_,
+            grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
  } else {
-    VLOG(3) << "op grad " << grad_op_desc_->Type();
-    for (auto it : grad_output_vars_) {
-      auto& outputs = grad_outputs[it.first];
-      for (size_t i = 0; i < it.second.size(); ++i) {
-        // Allocate a new variable
-        Variable* tmp_var = new framework::Variable();
-        tmp_var->GetMutable<framework::LoDTensor>();
-        outputs.push_back(tmp_var);
-      }
-    }
-    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
-    // No need to do compile time infer shape here.
-    // grad_op_desc_->InferShape(*block_);
-    grad_op_desc_->InferVarType(block_);
-    std::unique_ptr<framework::OperatorBase> opbase =
-        framework::OpRegistry::CreateOp(*grad_op_desc_);
-    framework::OperatorWithKernel* op_kernel =
-        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
-    framework::Scope scope;
-    platform::CPUPlace place;
-    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-    p.op.RuntimeInferShape(scope, place, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    grad_outputs.resize(grad_op_descs_.size());
+    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+      framework::OpDesc* grad_op_desc = grad_op_descs_[k];
+      VLOG(3) << "op grad " << grad_op_desc->Type();
+      for (auto it : grad_output_vars_[k]) {
+        auto& outputs = grad_outputs[k][it.first];
+        for (size_t i = 0; i < it.second.size(); ++i) {
+          // Allocate a new variable
+          Variable* tmp_var = new framework::Variable();
+          tmp_var->GetMutable<framework::LoDTensor>();
+          outputs.push_back(tmp_var);
+        }
+      }
+      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
+      // No need to do compile time infer shape here.
+      // grad_op_desc_->InferShape(*block_);
+      grad_op_desc->InferVarType(block_);
+      std::unique_ptr<framework::OperatorBase> opbase =
+          framework::OpRegistry::CreateOp(*grad_op_desc);
+      framework::OperatorWithKernel* op_kernel =
+          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+      PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+      framework::Scope scope;
+      PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
+      p.op.RuntimeInferShape(scope, place_, ctx);
+      p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    }
  }
-  for (auto it : grad_output_vars_) {
-    auto& outputs = grad_outputs[it.first];
-    auto& origin_outputs = it.second;
-    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
-    for (size_t i = 0; i < outputs.size(); ++i) {
-      framework::Variable* grad = outputs[i];
-      framework::Variable* orig_grad = origin_outputs[i];
-      AddTo(grad, orig_grad);
-      delete grad;
+  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
+    for (auto it : grad_output_vars_[k]) {
+      auto& outputs = grad_outputs[k][it.first];
+      auto& origin_outputs = it.second;
+      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
+      for (size_t i = 0; i < outputs.size(); ++i) {
+        framework::Variable* grad = outputs[i];
+        framework::Variable* orig_grad = origin_outputs[i];
+        AddTo(grad, orig_grad, place_);
+        delete grad;
+      }
    }
  }
  return input_vars_;
}
@@ -188,8 +274,10 @@ void VarBase::RunBackward() {
  VLOG(3) << "start backward";
  auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
-  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
-  std::fill(data, data + grads_t->numel(), 1.0);
+  operators::math::set_constant(
+      *(platform::DeviceContextPool::Instance().Get(
+          var_->GetMutable<framework::LoDTensor>()->place())),
+      grads_t, 1.0);
  PADDLE_ENFORCE(
      grads_ ==
...
@@ -21,17 +21,21 @@
#include <map>     // NOLINT
#include <string>  // NOLINT
#include <vector>  // NOLINT
+#include <memory>  // NOLINT
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
+class VarBase;
namespace py = ::pybind11;
class PreparedOp {
@@ -81,6 +85,8 @@ class PreparedOp {
    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
  }
+  inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
  const framework::OperatorBase& op;
  const framework::RuntimeContext& ctx;
  framework::OperatorWithKernel::OpKernelFunc func;
@@ -148,6 +154,9 @@ class VarBase {
  framework::LoDTensor& GradValue();
+  std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
+                                      const bool blocking) const;
  inline std::string GradName() const {
    PADDLE_ENFORCE(
        var_desc_,
@@ -175,11 +184,13 @@ class OpBase {
  OpBase()
      : op_desc_(nullptr),
        forward_id_(-1),
-        grad_op_desc_(nullptr),
-        backward_id_(-1) {}
+        backward_id_(-1),
+        place_(platform::CPUPlace()) {}
  virtual ~OpBase() {
-    if (grad_op_desc_) delete grad_op_desc_;
+    for (framework::OpDesc* desc : grad_op_descs_) {
+      delete desc;
+    }
  }
  std::map<std::string, std::vector<VarBase*>> ApplyGrad();
@@ -188,18 +199,25 @@ class OpBase {
  // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
  framework::OpDesc* op_desc_;
  int forward_id_;
-  // When has backward, one of `grad_op_desc_` or `backward_id_` is set,
+  // When has backward, one of `grad_op_descs_` or `backward_id_` is set,
  // not both.
-  framework::OpDesc* grad_op_desc_;
+  // Note: each fwd op corresponds to a vector of bwd ops.
+  std::vector<framework::OpDesc*> grad_op_descs_;
  int backward_id_;
+  platform::Place place_;
  VarBasePtrMap input_vars_;
  VarBasePtrMap output_vars_;
  OpBasePtrMap pre_ops_;
  std::map<std::string, std::vector<int>> pre_ops_out_idx_;
-  framework::VariableValueMap grad_input_vars_;
-  framework::VariableValueMap grad_output_vars_;
+  // Inputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_input_vars_;
+  // Outputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_output_vars_;
  framework::BlockDesc* block_;
};
...
@@ -14,33 +14,60 @@
#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
                  const std::unordered_set<std::string>& no_grad_set,
                  const std::vector<framework::BlockDesc*>& grad_sub_block,
-                  framework::OpDesc** grad_op_desc,
+                  std::vector<framework::OpDesc*>* grad_op_descs,
                  std::unordered_map<std::string, std::string>* grad_to_var) {
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+  PADDLE_ENFORCE(grad_op_descs->empty());
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
      framework::OpInfoMap::Instance()
          .Get(op_desc.Type())
          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
-  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
-  // TODO(panyx0718): Leak?
-  *grad_op_desc = grad_op_descs[0].release();
+  for (auto& desc : descs) {
+    grad_op_descs->emplace_back(desc.release());
+  }
}
-void InitVar(framework::Variable* var, framework::Variable* grad_var) {
+void InitVar(framework::Variable* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(dev_ctx,
+                          "Could not get valid device from forward op");
  auto& var_t = var->Get<framework::LoDTensor>();
-  float* data =
-      grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-          var_t.dims(), platform::CPUPlace());
-  std::fill(data, data + var_t.numel(), 0.0);
+  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+      var_t.dims(), dev_ctx->GetPlace());
+  operators::math::set_constant(
+      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
+}
+
+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
+  platform::Place result = place;
+  for (auto it : inputs) {
+    for (VarBase* var : it.second) {
+      platform::Place tmp_place =
+          var->var_->Get<framework::LoDTensor>().place();
+      if (!platform::is_same_place(tmp_place, result)) {
+        PADDLE_THROW(
+            "Input variables should be kept in the same place: %s, but got "
+            "place %s for input %s instead",
+            result, tmp_place, it.first);
+      }
+    }
+  }
+  return result;
}
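The new `GetExpectedPlace` helper above enforces that every input variable already lives on the requested device before the op runs. A standalone sketch of that invariant check (simplified types, not the Paddle API):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-in for platform::Place.
    using Place = std::string;

    // Every input must already live on `place`; mixed-device inputs are a
    // logic error, mirroring the PADDLE_THROW branch above.
    Place GetExpectedPlace(const Place& place,
                           const std::map<std::string, std::vector<Place>>& inputs) {
      for (const auto& kv : inputs) {
        for (const Place& input_place : kv.second) {
          assert(input_place == place && "inputs must share one place");
        }
      }
      return place;
    }

    int main() {
      std::map<std::string, std::vector<Place>> inputs{{"X", {"CPUPlace"}}};
      assert(GetExpectedPlace("CPUPlace", inputs) == "CPUPlace");
      return 0;
    }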
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                   const VarBasePtrMap& outputs, framework::BlockDesc* block,
+                  const platform::Place expected_place,
                   const bool stop_gradient) {
  std::map<std::string, VarBase*> vars;
@@ -105,51 +132,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
  framework::Scope scope;
-  platform::CPUPlace place;
-  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-  p.op.RuntimeInferShape(scope, place, ctx);
-  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+  op->place_ = GetExpectedPlace(expected_place, inputs);
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
+  prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
+  prepared_op.func(framework::ExecutionContext(
+      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
  if (!stop_gradient) {
-    framework::OpDesc* grad_op_desc;
-    // TODO(panyx): Is this leaked?
    std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
        new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
-    op->grad_op_desc_ = grad_op_desc;
-    for (auto it : grad_op_desc->Inputs()) {
-      auto& grad_in_vars = op->grad_input_vars_[it.first];
-      for (const std::string& grad_invar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_invar);
-        auto var_it = grad_to_var->find(grad_invar);
-        if (var_it == grad_to_var->end()) {
-          auto fwd_var_it = vars.find(grad_invar);
-          PADDLE_ENFORCE(fwd_var_it != vars.end());
-          // Forward inputs or outputs.
-          grad_in_vars.push_back(fwd_var_it->second->var_);
-        } else {
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
-          }
-          // Douts.
-          grad_in_vars.push_back(var->grads_->var_);
-        }
-      }
-    }
-    for (auto it : grad_op_desc->Outputs()) {
-      auto& grad_out_vars = op->grad_output_vars_[it.first];
-      for (const std::string& grad_outvar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_outvar);
-        auto var_it = grad_to_var->find(grad_outvar);
-        PADDLE_ENFORCE(var_it != grad_to_var->end());
-        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
-        }
-        grad_out_vars.push_back(var->grads_->var_);
-      }
-    }
+    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
+    op->grad_input_vars_.resize(op->grad_op_descs_.size());
+    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+      framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            // Forward inputs or outputs.
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->var_->IsInitialized()) {
+              InitVar(var->var_, var->grads_->var_,
+                      prepared_op.GetDeviceContext());
+            }
+            // Douts.
+            grad_in_vars.push_back(var->grads_->var_);
+          }
+        }
+      }
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[i][it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end(),
+                         "Could not find the grad op output var; should this "
+                         "operator %s's stop gradient be True?",
+                         op_desc->Type());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->var_->IsInitialized()) {
+            InitVar(var->var_, var->grads_->var_,
+                    prepared_op.GetDeviceContext());
+          }
+          grad_out_vars.push_back(var->grads_->var_);
+        }
+      }
+    }
  }
}
@@ -178,10 +213,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
    out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
  }
  if (!stop_gradient) {
+    op->grad_input_vars_.resize(1);
+    op->grad_output_vars_.resize(1);
    auto& grad_input_vars =
-        op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
+        op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
    auto& grad_output_vars =
-        op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
+        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
    for (const VarBase* inp : inputs) {
      grad_input_vars.push_back(inp->var_);
@@ -189,16 +226,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
    for (VarBase* out : outputs) {
      grad_input_vars.push_back(out->var_);
    }
+    platform::CPUPlace place;
    for (VarBase* out : outputs) {
      grad_input_vars.push_back(out->grads_->var_);
      if (!grad_input_vars.back()->IsInitialized()) {
-        InitVar(out->var_, grad_input_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(out->var_, grad_input_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
      }
    }
    for (const VarBase* inp : inputs) {
      grad_output_vars.push_back(inp->grads_->var_);
      if (!grad_output_vars.back()->IsInitialized()) {
-        InitVar(inp->var_, grad_output_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(inp->var_, grad_output_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
      }
    }
  }
...
@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace imperative {
@@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc,
void InitVar(framework::Variable* var, framework::Variable* grad_var);
+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
class Tracer {
 public:
  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
  virtual ~Tracer() {}
-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block, const bool stop_gradient = false);
+  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+             const VarBasePtrMap& outputs, framework::BlockDesc* block,
+             const platform::Place expected_place,
+             const bool stop_gradient = false);
  std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                bool stop_gradient = false);
 private:
+  platform::Place GetPlace(const VarBasePtrMap& inputs);
  framework::BlockDesc* root_block_;
};
...
@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
@@ -130,10 +131,14 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
+                      contrib::AnalysisConfig::Precision);
  // Memory optimized related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
  // Indicate which kind of sort algorithm is used for operators, the memory
  // optimization relays on the sort algorithm.
  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
...
...@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
  attr->set_i(data);
}
template <>
+void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
+                   const bool &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+  attr->set_b(data);
+}
+template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
                      const int64_t &data) {
  auto *attr = op->add_attrs();
...
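The SetAttr&lt;bool&gt; specialization added above mirrors the existing int/int64_t overloads: one fully specialized function template per attribute type, each writing the matching proto field. A minimal standalone sketch of the same pattern, using invented ToyAttr/ToyOpDesc stand-ins rather than Paddle's protobuf-generated classes:

    // Standalone sketch of the typed-attribute-setter pattern above.
    // ToyAttr/ToyOpDesc are hypothetical stand-ins for the protobuf-generated
    // paddle::framework::proto types; only the specialization shape matters.
    #include <cassert>
    #include <string>
    #include <vector>

    struct ToyAttr {
      std::string name;
      bool b = false;
      int i = 0;
    };
    struct ToyOpDesc {
      std::vector<ToyAttr> attrs;
      ToyAttr* add_attrs() {
        attrs.emplace_back();
        return &attrs.back();
      }
    };

    template <typename T>
    void SetAttr(ToyOpDesc* op, const std::string& name, const T& data);

    template <>
    void SetAttr<bool>(ToyOpDesc* op, const std::string& name, const bool& data) {
      auto* attr = op->add_attrs();
      attr->name = name;
      attr->b = data;  // the proto version also records AttrType::BOOLEAN
    }

    int main() {
      ToyOpDesc op;
      SetAttr<bool>(&op, "enable_int8", true);
      assert(op.attrs.back().b);
    }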
...@@ -17,6 +17,7 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
+#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
...@@ -29,9 +30,14 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
#define GCC_ATTRIBUTE(attr__) ;
+#define MKDIR(path) _mkdir(path)
#else
+#include <unistd.h>
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
...@@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) {
  return false;
}
+static std::string GetDirRoot(const std::string &path) {
+  char sep = '/';
+#ifdef _WIN32
+  sep = '\\';
+#endif
+  size_t i = path.rfind(sep, path.length());
+  if (i != std::string::npos) {
+    return (path.substr(0, i));
+  }
+  return path;
+}
+static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
+  std::string opt_cache_dir = model_root + "/_opt_cache/";
+  if (!PathExists(opt_cache_dir)) {
+    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
+                   "Cannot create optimize cache directory: %s; make sure you "
+                   "have permission to write",
+                   opt_cache_dir);
+  }
+  return opt_cache_dir;
+}
+static std::string GetTrtCalibPath(const std::string &model_root,
+                                   const std::string &engine_key) {
+  return model_root + "/trt_calib_" + engine_key;
+}
+// If there is no calib table data file in model_opt_cache_dir, return "".
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
+                                        const std::string &engine_key,
+                                        bool enable_int8) {
+  std::string trt_calib_table_path =
+      GetTrtCalibPath(model_opt_cache_dir, engine_key);
+  if (enable_int8 && FileExists(trt_calib_table_path)) {
+    VLOG(3) << "Calibration table file: " << trt_calib_table_path
+            << " is found here";
+    std::ifstream infile(trt_calib_table_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string calibration_data(buffer.str());
+    return calibration_data;
+  }
+  return "";
+}
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
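GetOrCreateModelOptCacheDir and GetTrtCalibPath above only compose strings; a standalone sketch of the resulting layout (the paths and engine key are hypothetical, and the MKDIR/PADDLE_ENFORCE directory creation is elided). Note that because the cache dir already ends with '/', the path composed exactly as above carries a harmless double slash on POSIX:

    #include <iostream>
    #include <string>

    // Same string logic as GetDirRoot above ('\\' handling on _WIN32 omitted).
    static std::string DirRoot(const std::string& path) {
      size_t i = path.rfind('/');
      return i != std::string::npos ? path.substr(0, i) : path;
    }

    int main() {
      const std::string prog_file = "/models/mobilenet/__model__";  // hypothetical
      const std::string model_root = DirRoot(prog_file);  // "/models/mobilenet"
      const std::string cache_dir = model_root + "/_opt_cache/";
      const std::string calib = cache_dir + "/trt_calib_" + "1439700129";  // fake key
      std::cout << calib << "\n";  // benign "//" when composed as above
    }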
...@@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",
                new int(argument->tensorrt_min_subgraph_size()));
+     pass->Set("program",
+               new framework::ProgramDesc *(&argument->main_program()));
+     bool enable_int8 = argument->tensorrt_precision_mode() ==
+                        contrib::AnalysisConfig::Precision::kInt8;
+     pass->Set("enable_int8", new bool(enable_int8));
+     std::string model_opt_cache_dir =
+         argument->Has("model_dir")
+             ? argument->model_dir()
+             : GetDirRoot(argument->model_program_path());
+     pass->Set(
+         "model_opt_cache_dir",
+         new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
    }
    // graph_ = pass->Apply(std::move(graph_));
...@@ -91,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
}
framework::proto::ProgramDesc IRPassManager::AcquireProgram(
-    std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
+    std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
  auto pass =
      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-  ProgramDesc desc(program);
+  // Directly using ProgramDesc desc(argument->main_program()) may cause an
+  // incomplete copy of the information.
+  ProgramDesc desc;
+  desc.CopyFrom(*program->Proto());
  pass->SetNotOwned("program", &desc);
  auto *the_graph = graph->release();
  *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
...
...@@ -29,6 +29,7 @@
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace inference {
...@@ -42,8 +43,8 @@ class IRPassManager final {
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
-  framework::proto::ProgramDesc AcquireProgram(
-      std::unique_ptr<Graph> *graph, const ProgramDesc &program) const;
+  framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
+                                               ProgramDesc *program) const;
  framework::ir::Graph &graph() const { return *graph_; }
...
...@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
+#include <set>
#include <string>
#include <vector>
...@@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
  return graph;
}
+std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
+                              const std::set<std::string> &engine_outputs) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                            Graph *graph) const {
  auto *op_desc = node->Op();
  auto &subgraph = *Agent(node).subgraph();
  PADDLE_ENFORCE(!subgraph.empty());
+  framework::ProgramDesc *program_desc =
+      Get<framework::ProgramDesc *>("program");
+  // Add new block for TensorRTEngineOP
+  const framework::BlockDesc &main_block =
+      program_desc->Block(framework::kRootBlockIndex);
+  // const framework::BlockDesc& main_block = program_desc->Block(0);
+  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
  // A fake block desc.
  framework::proto::BlockDesc block_proto;
  framework::BlockDesc block_desc(nullptr, &block_proto);
...@@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                    subgraph.size());
  for (auto *node : subgraph) {
+    auto *new_block_op = new_block->AppendOp();
    auto *op = block_desc.AppendOp();
+    *new_block_op->Proto() = *node->Op()->Proto();
    *op->Proto() = *node->Op()->Proto();
  }
-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
+  // Then, we will use the input_names_with_id and output_names_with_id to
+  // generate the engine key.
+  // So, we use set instead of unordered_set here to ensure that the engine key
+  // is unique.
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
  for (auto *x : node->outputs) {
    output_names.insert(x->Name());
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // to Tensor.
  std::vector<std::string> output_mapping;
  for (auto name : output_names) {
-    // LOG(INFO) << name << " " << output_name_map.size();
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
...@@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
      *vars->Add() = *node->Var()->Proto();
    }
  }
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
  PADDLE_ENFORCE(!output_mapping.empty());
-  // Set attrs
+  op_desc->SetBlockAttr("sub_block", new_block);
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
+  // Set attrs
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  auto enable_int8 = Get<bool>("enable_int8");
+  auto engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  std::string calibration_data = GetTrtCalibTableData(
+      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "engine_key", engine_key);
}
std::vector<std::string> ExtractParameters(
...
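GenerateEngineKey above derives a stable identifier for a TensorRT subgraph: because std::set iterates in sorted order, the concatenation (and hence the std::hash value) does not depend on the order in which node names were collected, so the same subgraph always maps to the same calibration file. A standalone check of that property:

    // Standalone check that the set-based key is order-insensitive: inserting
    // the same names in different orders yields the same engine key.
    #include <cassert>
    #include <functional>
    #include <set>
    #include <string>

    std::string GenerateEngineKey(const std::set<std::string>& ins,
                                  const std::set<std::string>& outs) {
      std::string key;
      for (const auto& n : ins) key += n;
      for (const auto& n : outs) key += n;
      return std::to_string(std::hash<std::string>()(key));
    }

    int main() {
      std::set<std::string> a{"x0", "x1"}, b;
      b.insert("x1");
      b.insert("x0");  // different insertion order, same set ordering
      assert(GenerateEngineKey(a, {"y0"}) == GenerateEngineKey(b, {"y0"}));
    }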
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
...
...@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
  }
  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
-  framework::ProgramDesc desc(argument->main_program());
+  // Directly using ProgramDesc desc(argument->main_program()) may cause an
+  // incomplete copy of the information.
+  framework::ProgramDesc desc;
+  desc.CopyFrom(*argument->main_program().Proto());
  pass->SetNotOwned("program", &desc);
  auto thegraph = pass->Apply(std::move(graph));
  thegraph.release();  // the argument still owns the graph.
...
...@@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
  return batch_shapes;
}
+// Replace the -1 in shape to a real number to fake the shape.
+std::vector<std::map<std::string, std::vector<int>>> FakeBatchVarShapes(
+    const framework::ProgramDesc& program) {
+  std::vector<std::map<std::string, std::vector<int>>> res;
+  res.emplace_back();
+  auto& record = res.front();
+  const int fake_batch_size = 3;
+  for (auto* var : program.Block(0).AllVars()) {
+    if (var->GetType() ==
+        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      auto shape = var->GetShape();
+      for (auto& v : shape) {
+        if (v < 0) v = fake_batch_size;
+      }
+      record[var->Name()].assign(shape.begin(), shape.end());
+    }
+  }
+  return res;
+}
// Calculate the average dim of each tensor from the batch shape cache.
std::unordered_map<std::string, size_t> GetBatchAverageSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
...@@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
  std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
  for (auto& batch : batches) {
    for (auto& ele : batch) {
+      PADDLE_ENFORCE(!ele.second.empty());
      int batch_size = ele.second.front();
      // TODO(Superjomn) might consume large memory here, use combine hash.
      var_batchsize_hashes[ele.first] << batch_size;
...@@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
+std::pair<size_t, size_t> GetRange(
+    const std::unordered_map<std::string, size_t>& ave_size) {
+  auto res = std::make_pair(std::numeric_limits<size_t>::max(),
+                            std::numeric_limits<size_t>::min());
+  for (auto& item : ave_size) {
+    res.first = std::min(item.second, res.first);
+    res.second = std::max(item.second, res.second);
+  }
+  return res;
+}
void MemoryOptimizePass::RunImpl(Argument* argument) {
  // When force update, should not optimize memory.
-  if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
+  if (!argument->enable_memory_optim() ||
+      argument->static_memory_optim_force_update())
    return;
  graph_ = argument->main_graph_ptr();
...@@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
      argument->model_program_path_valid() ? argument->model_program_path()
                                           : "");
  VLOG(3) << "Load memory cache from " << path;
-  if (inference::IsFileExists(path)) {
-    VLOG(4) << "Performing memory optimize";
-    auto batches = DeseralizeBatchVarShapes(path);
-    auto var_batch_ave_size = GetBatchAverageSize(batches);
+  std::vector<std::map<std::string, std::vector<int>>> batches;
+  if (argument->static_memory_optim() && inference::IsFileExists(path)) {
+    string::PrettyLogInfo("--- Performing static memory optimize");
+    batches = DeseralizeBatchVarShapes(path);
+  } else {
+    string::PrettyLogInfo("--- Performing dynamic memory optimize");
+    batches = FakeBatchVarShapes(argument->main_program());
+  }
+  auto var_batch_ave_size = GetBatchAverageSize(batches);
+  // Get min and max memory size.
+  const auto range = GetRange(var_batch_ave_size);
+  const int cluster_size = std::max(
+      static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
+      1024);
+  const int cluster_size1 = std::max(
+      static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
+      1024);
  std::unordered_map<std::string, Node*> tensor_nodes;
  space_table_t space_table;
  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
  std::unordered_map<std::string, std::string> reuse_table;
  double max_saving_ratio = 0.;
  std::vector<std::function<MemoryAllocation()>> strategies;
  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+    if (argument->static_memory_optim()) {
+      // This strategy only makes sense in static memory optimize.
      strategies.emplace_back([&, sort_kind] {
        auto clustered_vars_by_batch_size =
            AnalysisBatchShapesByBatchSize(batches);
...@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
                      space_table, &reuse_table, sort_kind, &allocation);
        return allocation;
      });
+    }
    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches, 1024);  // interval 1kb
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size);
      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
      return allocation;
    });
-    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches, 1024 * 1024);  // interval 1MB
-      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
-      return allocation;
-    });
-    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches,
-          std::numeric_limits<int>::max());  // no intervals
-      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
-      return allocation;
-    });
-  }
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches,
+          std::numeric_limits<int>::max());  // no intervals
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+  }
  std::function<MemoryAllocation()>* best_strategy{nullptr};
  // Try all strategies to get the best result.
  for (auto& strategy : strategies) {
    auto allocation = strategy();
    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
                            allocation.GetSavingRatio());
    if (allocation.GetSavingRatio() > max_saving_ratio) {
      max_saving_ratio = allocation.GetSavingRatio();
      best_strategy = &strategy;
    }
  }
  if (!best_strategy) {
-    LOG(ERROR)
-        << "This model makes poor memory optimize, skip memory optimize";
+    LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
    return;
  }
  auto memory_allocation = (*best_strategy)();
-  string::PrettyLogH2(
+  string::PrettyLogInfo(
      "--- Saved %.2f%s memory for workspace(temporary variables)",
      memory_allocation.GetSavingRatio() * 100, "%");
-  string::PrettyLogDetail("--- Allocated %d MB",
-                          memory_allocation.allocated / 1024. / 1024.);
-  string::PrettyLogDetail("--- Saved %d MB",
-                          memory_allocation.saved / 1024. / 1024.);
  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
                             new std::unordered_set<std::string>);
  auto& vars2remove =
      argument->main_graph().Get<std::unordered_set<std::string>>(
          framework::ir::kGraphToProgramVarsToRemove);
  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
...
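The rewritten RunImpl above builds a list of candidate reuse plans (several clustering granularities times two sort kinds) and keeps the one with the best saving ratio, skipping optimization entirely when nothing helps. A condensed standalone sketch of that selection loop, with a toy allocation type and fabricated ratios:

    // Condensed sketch of the strategy-selection loop above: run every
    // candidate, keep the one with the best saving ratio, bail out if none
    // helps. ToyAllocation and the ratios are fabricated for illustration.
    #include <cstdio>
    #include <functional>
    #include <vector>

    struct ToyAllocation {
      double saving_ratio;
    };

    int main() {
      std::vector<std::function<ToyAllocation()>> strategies = {
          [] { return ToyAllocation{0.12}; },  // e.g. cluster by batch size
          [] { return ToyAllocation{0.31}; },  // e.g. cluster by similar size
          [] { return ToyAllocation{0.27}; },  // e.g. no intervals
      };
      double max_saving_ratio = 0.;
      std::function<ToyAllocation()>* best_strategy = nullptr;
      for (auto& strategy : strategies) {
        auto allocation = strategy();
        if (allocation.saving_ratio > max_saving_ratio) {
          max_saving_ratio = allocation.saving_ratio;
          best_strategy = &strategy;
        }
      }
      if (!best_strategy) return 0;  // poor model for memory optimize: skip
      std::printf("saved %.2f%% of workspace memory\n",
                  (*best_strategy)().saving_ratio * 100);
    }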
...@@ -15,7 +15,7 @@
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace inference {
...
...@@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  CP_MEMBER(memory_pool_init_size_mb_);
  CP_MEMBER(enable_memory_optim_);
-  CP_MEMBER(memory_optim_force_update_);
+  CP_MEMBER(static_memory_optim_);
+  CP_MEMBER(static_memory_optim_force_update_);
  // TensorRT related.
  CP_MEMBER(use_tensorrt_);
  CP_MEMBER(tensorrt_workspace_size_);
  CP_MEMBER(tensorrt_max_batchsize_);
  CP_MEMBER(tensorrt_min_subgraph_size_);
+  CP_MEMBER(tensorrt_precision_mode_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
...@@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
  Update();
}
-void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
-                                                   int max_batch_size,
-                                                   int min_subgraph_size) {
+void contrib::AnalysisConfig::EnableTensorRtEngine(
+    int workspace_size, int max_batch_size, int min_subgraph_size,
+    contrib::AnalysisConfig::Precision precision_mode) {
#ifdef PADDLE_WITH_CUDA
  if (!use_gpu()) {
    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...@@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;
+  tensorrt_precision_mode_ = precision_mode;
  Update();
#else
...@@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
  ss << tensorrt_min_subgraph_size_;
  ss << enable_memory_optim_;
-  ss << memory_optim_force_update_;
+  ss << static_memory_optim_;
+  ss << static_memory_optim_force_update_;
  ss << use_mkldnn_;
  for (auto &item : mkldnn_enabled_op_types_) ss << item;
...@@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
-void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+void contrib::AnalysisConfig::EnableMemoryOptim(
+    bool static_optim, bool force_update_static_cache) {
  enable_memory_optim_ = true;
-  memory_optim_force_update_ = force_update_cache;
+  static_memory_optim_ = static_optim;
+  static_memory_optim_force_update_ = force_update_static_cache;
  Update();
}
...@@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
  Update();
}
+NativeConfig contrib::AnalysisConfig::ToNativeConfig() const {
+  NativeConfig config;
+  config.model_dir = model_dir_;
+  config.prog_file = prog_file_;
+  config.param_file = params_file_;
+  config.use_gpu = use_gpu_;
+  config.device = device_id_;
+  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
+  config.specify_input_name = specify_input_name_;
+  return config;
+}
}  // namespace paddle
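ToNativeConfig() moved out of line but its behavior is unchanged. A hedged usage sketch (the model path is hypothetical, and the include assumes the public inference API header used elsewhere in this change):

    // Hedged usage sketch: configure an AnalysisConfig, then derive a
    // NativeConfig from it via the new out-of-line ToNativeConfig().
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // hypothetical
      config.EnableMemoryOptim();  // defaults to dynamic memory optimize now
      paddle::NativeConfig native = config.ToNativeConfig();
      auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(native);
      return predictor != nullptr ? 0 : 1;
    }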
...@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <algorithm>
+#include <fstream>
#include <memory>
#include <string>
#include <vector>
...@@ -25,6 +26,7 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -37,6 +39,8 @@
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
DECLARE_bool(profile);
...@@ -44,6 +48,12 @@ DECLARE_bool(profile);
namespace paddle {
using contrib::AnalysisConfig;
+using inference::Singleton;
+#if PADDLE_WITH_TENSORRT
+using inference::tensorrt::TRTInt8Calibrator;
+using inference::tensorrt::TRTCalibratorEngine;
+using inference::tensorrt::TRTCalibratorEngineManager;
+#endif
namespace {
bool IsPersistable(const framework::VarDesc *var) {
...@@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
  if (!program) {
    if (!LoadProgramDesc()) return false;
+    // If not cloned, the parameters should be loaded.
+    // If config_.ir_optim() is True, parameters are loaded in
+    // OptimizeInferenceProgram(), but other persistable variables
+    // (like RAW type vars) are not created in the scope.
+    // If config_.ir_optim() is False, parameters are loaded in
+    // LoadParameters(), and the other persistable variables still need to be
+    // created.
+    // So in both cases, create the persistable variables first.
+    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
    // Optimize the program, and load parameters and modify them in the
    // scope_.
    // This will change the scope_ address.
...@@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram(
    status_ir_optim_enabled_ = true;
    OptimizeInferenceProgram();
  } else {
-    // If the parent_scope is passed, we assert that the persistable variables
-    // are already created, so just create the no persistable variables.
-    // If not cloned, the parameters should be loaded
-    // OptimizeInferenceProgram.
-    // So in both cases, just the local variables are needed to load, not the
-    // parematers.
-    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
    // Load parameters
    LOG(INFO) << "load parameters ";
    LoadParameters();
...@@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                 framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+  outputs->resize(fetches_.size());
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
    PADDLE_ENFORCE((size_t)idx == i);
    framework::LoDTensor &fetch =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
+    output->name = fetches_[idx]->Input("X")[0];
    if (type == framework::proto::VarType::FP32) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
...@@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
+  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
+  argument_.SetStaticMemoryOptimForceUpdate(
+      config_.static_memory_optim_force_update_);
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
...@@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
                       !config_.params_file().empty(),
                   "Either model_dir or (param_file, prog_file) should be set.");
    PADDLE_ENFORCE(!config_.prog_file().empty());
+    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
    argument_.SetModelProgramPath(config_.prog_file());
    argument_.SetModelParamsPath(config_.params_file());
  }
...@@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
+    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
  }
  if (config_.use_mkldnn_) {
...@@ -361,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  argument_.SetIrAnalysisPasses(passes);
  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
-  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
+  argument_.SetScopeNotOwned(scope_.get());
  Analyzer().Run(&argument_);
  PADDLE_ENFORCE(argument_.scope_valid());
...@@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= static_cast<size_t>(idx)) {
-        fetchs_.resize(idx + 1);
+      if (fetches_.size() <= static_cast<size_t>(idx)) {
+        fetches_.resize(idx + 1);
      }
-      fetchs_[idx] = op;
+      fetches_[idx] = op;
    }
  }
}
...@@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() {
  return true;
}
+#if PADDLE_WITH_TENSORRT
+bool AnalysisPredictor::SaveTrtCalibToDisk() {
+  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
+                 "This func can be invoked only in trt mode");
+  auto &block = inference_program_->Block(0);
+  for (auto &op_desc : block.AllOps()) {
+    if (op_desc->Type() == "tensorrt_engine") {
+      std::string engine_name =
+          boost::get<std::string>(op_desc->GetAttr("engine_key"));
+      if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
+        LOG(ERROR) << "You should run the predictor(with trt) on the real data "
+                      "to generate calibration info";
+        return false;
+      }
+      TRTCalibratorEngine *calib_engine =
+          Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
+      LOG(INFO) << "Wait for calib threads done.";
+      calib_engine->calib_->waitAndSetDone();
+      LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
+                   "of time...";
+      calib_engine->thr_->join();
+      std::string calibration_table_data =
+          calib_engine->calib_->getCalibrationTableAsString();
+      if (calibration_table_data.empty()) {
+        LOG(ERROR) << "the calibration table is empty.";
+        return false;
+      }
+      std::string model_opt_cache_dir =
+          argument_.Has("model_dir")
+              ? argument_.model_dir()
+              : inference::analysis::GetDirRoot(argument_.model_program_path());
+      std::string calibration_table_data_path =
+          inference::analysis::GetTrtCalibPath(
+              inference::analysis::GetOrCreateModelOptCacheDir(
+                  model_opt_cache_dir),
+              engine_name);
+      std::ofstream ofile(calibration_table_data_path, std::ios::out);
+      LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
+                << calibration_table_data_path;
+      ofile << calibration_table_data;
+      ofile.close();
+    }
+  }
+  // Free all calibrator resources.
+  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
+  return true;
+}
+#endif
AnalysisPredictor::~AnalysisPredictor() {
+#if PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled() &&
+      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
+      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
+    SaveTrtCalibToDisk();
+  }
+#endif
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
...@@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  // check if the cache exists
  if (!config_.enable_memory_optim()) {
    need = false;
-  } else if (config_.enable_memory_optim() &&
+  } else if (config_.static_memory_optim_ &&
             !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
                 config_.model_dir(), config_.prog_file()))) {
    need = true;
-  } else if (config_.enable_memory_optim() &&
-             config_.memory_optim_force_update_) {
+  } else if (config_.static_memory_optim_ &&
+             config_.static_memory_optim_force_update_) {
    need = true;
  }
...@@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  return need;
}
+std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+  return inference_program_->Proto()->SerializeAsString();
+}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {
...
...@@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor {
  void SetMkldnnThreadID(int tid);
+  std::string GetSeriazlizedProgram() const override;
 protected:
  // For memory optimization.
  bool need_collect_var_shapes_for_memory_optim();
...@@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor {
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);
+#if PADDLE_WITH_TENSORRT
+  // When we use the Paddle-TRT INT8 engine, we first need to generate
+  // calibration table data. The calibration table contains the value range of
+  // each op's inputs and outputs; the whole process can be divided into
+  // several steps:
+  //
+  // 1. Build a 32-bit engine, run it on the calibration set, and record a
+  //    histogram of the distribution of activation values for each tensor.
+  // 2. Build a calibration table from the histograms.
+  //
+  // After step 2, the calibration table is stored on disk.
+  bool SaveTrtCalibToDisk();
+#endif
  // Some more detailed tests are made friends of the predictor, so that
  // all the details can be tested.
#if PADDLE_WITH_TESTING
...@@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor {
  std::shared_ptr<framework::ProgramDesc> inference_program_;
  std::vector<framework::OpDesc *> feeds_;
  std::map<std::string, size_t> feed_names_;
-  std::vector<framework::OpDesc *> fetchs_;
+  std::vector<framework::OpDesc *> fetches_;
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, wrong results and memory leak, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
...
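Putting the pieces of this change together, the INT8 workflow described in the analysis_predictor.h comment above looks roughly like the following from user code. This is a hedged sketch: the model path is hypothetical, EnableUseGpu is the pre-existing GPU switch assumed here, and a real calibration run must feed representative batches rather than the empty input borrowed from the demo test:

    // Hedged sketch of the Paddle-TRT INT8 calibration flow described above.
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // hypothetical
      config.EnableUseGpu(100 /*MB pool*/, 0 /*gpu id*/);  // assumed GPU switch
      config.EnableTensorRtEngine(
          1 << 20 /*workspace*/, 1 /*max batch*/, 3 /*min subgraph*/,
          paddle::contrib::AnalysisConfig::Precision::kInt8);
      auto predictor =
          paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);
      std::vector<paddle::PaddleTensor> outputs;
      predictor->Run({}, &outputs);  // run on calibration data to fill histograms
      // When the predictor is destroyed, SaveTrtCalibToDisk() writes
      // _opt_cache/trt_calib_<engine_key> next to the model.
      return 0;
    }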
...@@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) {
  {
    // The first predictor helps to cache the memory optimize strategy.
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
+    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
    // Run several times to check the parameters are not reused by mistake.
    for (int i = 0; i < 5; i++) {
...
...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <sstream>
+#include "paddle/fluid/framework/commit.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
  }
}
+std::string get_version() {
+  std::stringstream ss;
+  ss << "version: " << framework::paddle_version() << "\n";
+  ss << "commit: " << framework::paddle_commit() << "\n";
+  ss << "branch: " << framework::paddle_compile_branch() << "\n";
+  return ss.str();
+}
}  // namespace paddle
...@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
  predictor->Run({}, &outputs);
}
+TEST(paddle_inference_api, get_version) {
+  LOG(INFO) << "paddle version:\n" << get_version();
+  auto version = get_version();
+  ASSERT_FALSE(version.empty());
+}
}  // namespace paddle
...@@ -42,6 +42,10 @@ struct AnalysisConfig {
  explicit AnalysisConfig(const std::string& model_dir);
  explicit AnalysisConfig(const std::string& prog_file,
                          const std::string& params_file);
+  enum class Precision {
+    kFloat32 = 0,
+    kInt8,
+  };
  /** Set model with a directory.
   */
...@@ -135,7 +139,8 @@ struct AnalysisConfig {
   * subgraph is less than this, it will not transfer to TensorRT engine.
   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
-                           int max_batch_size = 1, int min_subgraph_size = 3);
+                           int max_batch_size = 1, int min_subgraph_size = 3,
+                           Precision precision = Precision::kFloat32);
  /** A boolean state telling whether the TensorRT engine is used.
   */
  bool tensorrt_engine_enabled() const { return use_tensorrt_; }
...@@ -162,17 +167,7 @@ struct AnalysisConfig {
  /** Transform the AnalysisConfig to NativeConfig.
   */
-  NativeConfig ToNativeConfig() const {
-    NativeConfig config;
-    config.model_dir = model_dir_;
-    config.prog_file = prog_file_;
-    config.param_file = params_file_;
-    config.use_gpu = use_gpu_;
-    config.device = device_id_;
-    config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
-    config.specify_input_name = specify_input_name_;
-    return config;
-  }
+  NativeConfig ToNativeConfig() const;
  /** Specify the operator type list to use MKLDNN acceleration.
   * @param op_list the operator type list.
   */
...@@ -195,7 +190,8 @@ struct AnalysisConfig {
  /** Turn on memory optimize
   * NOTE still in development, will be released later.
   */
-  void EnableMemoryOptim(bool force_update_cache = false);
+  void EnableMemoryOptim(bool static_optim = false,
+                         bool force_update_static_cache = false);
  /** Tell whether the memory optimization is activated. */
  bool enable_memory_optim() const;
...@@ -238,10 +234,12 @@ struct AnalysisConfig {
  // We set this variable to control the minimum number of nodes in the
  // subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
+  Precision tensorrt_precision_mode_;
  // memory reuse related.
  bool enable_memory_optim_{false};
-  bool memory_optim_force_update_{false};
+  bool static_memory_optim_{false};
+  bool static_memory_optim_force_update_{false};
  bool use_mkldnn_{false};
  std::unordered_set<std::string> mkldnn_enabled_op_types_;
...
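EnableMemoryOptim() now distinguishes static and dynamic optimization via its two parameters. A hedged sketch of the three resulting modes:

    // Hedged sketch of the EnableMemoryOptim() variants exposed above.
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void ConfigureMemoryOptim(paddle::contrib::AnalysisConfig* config) {
      // Dynamic memory optimize: fake a batch size for -1 dims, no shape cache.
      config->EnableMemoryOptim();
      // Static memory optimize: cluster on recorded shapes from a first run.
      // config->EnableMemoryOptim(true /*static_optim*/);
      // Static, and force the recorded shape cache to be rebuilt.
      // config->EnableMemoryOptim(true, true /*force_update_static_cache*/);
    }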
...@@ -215,6 +215,14 @@ class PaddlePredictor {
   */
  virtual ~PaddlePredictor() = default;
+  /** \brief Get the serialized model program that executes in inference phase.
+   * Its data type is ProgramDesc, which is a protobuf message.
+   */
+  virtual std::string GetSeriazlizedProgram() const {
+    assert(false);  // Force raise error.
+    return "NotImplemented";
+  }
  /** The common configs for all the predictors.
   */
  struct Config {
...@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
+std::string get_version();
}  // namespace paddle
...@@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
    passes_.assign({
      "infer_clean_graph_pass",                    //
      "conv_affine_channel_fuse_pass",             //
      "conv_eltwiseadd_affine_channel_fuse_pass",  //
      "conv_bn_fuse_pass",                         //
-      "conv_elementwise_add_act_fuse_pass",   //
-      "conv_elementwise_add2_act_fuse_pass",  //
-      "conv_elementwise_add_fuse_pass",       //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+      "conv_elementwise_add_act_fuse_pass",   //
+      "conv_elementwise_add2_act_fuse_pass",  //
+      "conv_elementwise_add_fuse_pass",       //
+#endif
    });
    for (int i = 6; i >= 3; i--) {
...
-nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
...
...@@ -69,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() {
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);
+  if (enable_int8_) {
+    infer_builder_->setInt8Mode(true);
+    PADDLE_ENFORCE(
+        calibrator_ != nullptr,
+        "The precision mode is 'INT8', the calibrator should not be nullptr");
+    infer_builder_->setInt8Calibrator(calibrator_);
+  }
  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
...
...@@ -23,12 +23,14 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
namespace inference {
namespace tensorrt {
+class TRTInt8Calibrator;
/*
 * TensorRT Engine.
 *
...@@ -55,13 +57,16 @@ class TensorRTEngine : public EngineBase {
  };
  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                int device = 0,
+                int device = 0, bool enable_int8 = false,
+                TRTInt8Calibrator* calibrator = nullptr,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
        stream_(stream),
-        logger_(logger),
-        device_(device) {}
+        device_(device),
+        enable_int8_(enable_int8),
+        calibrator_(calibrator),
+        logger_(logger) {}
  virtual ~TensorRTEngine();
...@@ -139,8 +144,8 @@ class TensorRTEngine : public EngineBase {
  // In the normal case, paddle-trt has a bug when running googlenet.
  // When there are more than two convolutions of 1 * 1 with the same input, the
  // paddle-tensorrt will do the merging optimization, which fuses those convs
-  // into
-  // one conv, and then trigger bug. So, We should use strategy to avoid this
+  // into one conv, and then triggers the bug. So we should use a strategy to
+  // avoid this
  // optimization for the time being. This bug will be fixed in the future.
  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
      itensor_quote_num;
...@@ -153,9 +158,14 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;
+  cudaStream_t stream_;
+  // The specific GPU id that the TensorRTEngine is bound to.
+  int device_;
+  bool enable_int8_;
+  TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated each execution.
  int batch_size_{-1};
-  cudaStream_t stream_;
  nvinfer1::ILogger& logger_;
...@@ -165,8 +175,6 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
      itensor_map_;
-  // The specific GPU id that the TensorRTEngine bounded to.
-  int device_;
  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
  // TensorRT related internal members
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "glog/logging.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Set the batch size before constructing the thread that executes the engine.
int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
TRTInt8Calibrator::TRTInt8Calibrator(
const std::unordered_map<std::string, size_t>& buffers, int batch_size,
std::string engine_name, const platform::Place place)
: batch_size_(batch_size), engine_name_(engine_name) {
int i = 0;
VLOG(4) << "Init a new calibrator: " << engine_name_;
for (const auto it : buffers) {
framework::Tensor temp_tensor;
std::string input_name = it.first;
int data_size = it.second;
int num_ele = data_size / sizeof(int16_t);
framework::DDim data_shape = framework::make_ddim({num_ele});
temp_tensor.Resize(data_shape);
data_tensors_.push_back(temp_tensor);
data_buffers_[input_name] = std::pair<void*, size_t>(
static_cast<void*>(temp_tensor.mutable_data<int16_t>(place)), num_ele);
i += 1;
}
}
TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
: batch_size_(0),
calib_running_(false),
data_is_set_(false),
done_(true),
calibration_table_(calib_data) {}
void TRTInt8Calibrator::waitAndSetDone() {
std::unique_lock<std::mutex> lk(mut_);
while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk);
if (!done_) {
done_ = true;
cond_.notify_all();
}
}
// There might be more than one input for the trt subgraph,
// so we use a map to store the input information.
bool TRTInt8Calibrator::setBatch(
const std::unordered_map<std::string, void*>& data) {
VLOG(3) << "set batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// There is one producer and one consumer. The producer sets the batch data
// and the consumer gets it. The size of the data pool is one, so the
// producer has to wait for the consumer to finish processing before it can
// set new data.
while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk);
// done_ is set to true by waitAndSetDone when all calibration data
// have been processed.
if (done_) return false;
// Sets the batch.
for (const auto& it : data) {
auto dataptr = data_buffers_.find(it.first);
if (dataptr == data_buffers_.end()) {
LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
<< "' does not match with the buffer names";
}
const auto& d = dataptr->second;
PADDLE_ENFORCE(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
}
data_is_set_ = true;
cond_.notify_all();
return true;
}
bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
int num_bindings) {
VLOG(4) << "get batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// The consumer has just finished processing a batch of data.
// The producer can set the data again.
calib_running_ = false;
cond_.notify_all();
// As long as there is data in the pool, the consumer can get it.
while (!data_is_set_ && !done_) cond_.wait(lk);
if (done_) return false;
// Gets the batch
for (int i = 0; i < num_bindings; i++) {
auto it = data_buffers_.find(names[i]);
if (it == data_buffers_.end()) {
LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
<< names[i] << "' at position " << i;
}
bindings[i] = it->second.first;
}
data_is_set_ = false;
calib_running_ = true;
VLOG(4) << "get batch done: " << engine_name_;
return true;
}
void TRTInt8Calibrator::setDone() {
std::unique_lock<std::mutex> lk(mut_);
done_ = true;
cond_.notify_all();
}
const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
if (calibration_table_.empty()) return nullptr;
length = calibration_table_.size();
return calibration_table_.data();
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
std::size_t length) {
calibration_table_ = std::string((const char*)ptr, length);
VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
<< " length=" << length;
}
TRTInt8Calibrator::~TRTInt8Calibrator() {
VLOG(4) << "Destroying calibrator for " << engine_name_;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
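setBatch() and getBatch() above implement a single-slot producer/consumer handshake: data_is_set_ is the full/empty flag of the slot, calib_running_ marks the consumer as busy, and done_ drains both sides at shutdown. A self-contained sketch of the same handshake (the SingleSlot name and int payload are mine, chosen for brevity):

#include <condition_variable>
#include <mutex>

// Single-slot exchange: the producer waits until the slot is empty, the
// consumer waits until it is full; set_done() releases both sides.
class SingleSlot {
 public:
  bool put(int value) {  // cf. TRTInt8Calibrator::setBatch
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [&] { return !full_ || done_; });
    if (done_) return false;
    slot_ = value;
    full_ = true;
    cv_.notify_all();
    return true;
  }
  bool take(int* value) {  // cf. TRTInt8Calibrator::getBatch
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [&] { return full_ || done_; });
    if (done_) return false;
    *value = slot_;
    full_ = false;
    cv_.notify_all();
    return true;
  }
  void set_done() {  // cf. setDone / waitAndSetDone
    std::lock_guard<std::mutex> lk(mu_);
    done_ = true;
    cv_.notify_all();
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int slot_ = 0;
  bool full_ = false;
  bool done_ = false;
};

The Paddle version keeps the extra calib_running_ flag because TensorRT is still reading the buffers after getBatch() returns; the producer must also wait out a consumer that is still running, not just a full slot.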
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class TensorRTEngine;
struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
int batch_size, std::string engine_name,
const platform::Place place);
explicit TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[],
int num_bindings) override;
bool setBatch(const std::unordered_map<std::string, void*>& data);
void setDone();
void waitAndSetDone();
const void* readCalibrationCache(std::size_t& length) override;
void writeCalibrationCache(const void* ptr, std::size_t length) override;
const std::string& getCalibrationTableAsString() {
return calibration_table_;
}
private:
const int batch_size_;
bool calib_running_{true};
bool data_is_set_{false};
bool done_{false};
std::mutex mut_;
std::condition_variable cond_;
std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
std::vector<framework::Tensor> data_tensors_;
std::string engine_name_;
std::string calibration_table_;
};
class TRTCalibratorEngine {
public:
TRTCalibratorEngine() {}
std::unique_ptr<TRTInt8Calibrator> calib_;
std::unique_ptr<std::thread> thr_;
std::unique_ptr<TensorRTEngine> engine_;
};
/*
* Manager to control the TensorRT Int8 calibrator creation and deletion.
*/
class TRTCalibratorEngineManager {
public:
bool Has() const { return res_.size() > 0; }
bool Has(const std::string& name) const {
if (res_.count(name) == 0) return false;
return res_.at(name).get() != nullptr;
}
// Get Int8Calibrator via name
TRTCalibratorEngine* Get(const std::string& name) const {
return res_.at(name).get();
}
// Look up or create a calibrator.
TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
if (res_.count(engine_name) == 0) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
}
return res_.at(engine_name).get();
}
// Create an Int8Calibrator
TRTCalibratorEngine* Create(const std::string& engine_name) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
return p;
}
void DeleteALL() {
for (auto& item : res_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
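TRTCalibratorEngineManager above is a name-keyed lookup-or-create registry over std::unique_ptr. A generic sketch of that pattern (the Registry template is illustrative, not Paddle code); it also shows why Get() on an unknown name throws, via std::unordered_map::at, while LookupOrCreate() always succeeds:

#include <memory>
#include <string>
#include <unordered_map>

// Name-keyed registry; LookupOrCreate constructs missing entries on demand.
template <typename T>
class Registry {
 public:
  T* LookupOrCreate(const std::string& name) {
    auto& slot = res_[name];       // operator[] inserts an empty unique_ptr
    if (!slot) slot.reset(new T);  // create on first use
    return slot.get();
  }
  T* Get(const std::string& name) const { return res_.at(name).get(); }
  bool Has(const std::string& name) const {
    auto it = res_.find(name);
    return it != res_.end() && it->second != nullptr;
  }
 private:
  std::unordered_map<std::string, std::unique_ptr<T>> res_;
};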
@@ -54,6 +54,7 @@ else()
   message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
 endif()
 # RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
+# googlenet
+inference_analysis_api_test_with_fake_data(test_analyzer_googlenet
+        "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL)
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
         "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
......
@@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) {
 }
 // Compare result of NativeConfig and AnalysisConfig with memory optimization.
-TEST(Analyzer_dam, compare_with_memory_optim) {
+TEST(Analyzer_dam, compare_with_static_memory_optim) {
   // The small dam will core in CI, but works locally.
   if (FLAGS_max_turn_num == 9) {
     contrib::AnalysisConfig cfg, cfg1;
@@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     SetInput(&input_slots_all);
     // Run the first time to force to update memory cache
     SetConfig(&cfg);
-    cfg.EnableMemoryOptim(true);
+    cfg.EnableMemoryOptim(true, true /*force update*/);
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
@@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     // Run second time to use the memory cache and perform memory optimization.
     SetConfig(&cfg1);
-    cfg1.EnableMemoryOptim();
+    cfg1.EnableMemoryOptim(true, false /*do not force update*/);
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
@@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
   }
 }
+TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
+  // The small dam will core in CI, but works locally.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim();
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+  }
+}
 TEST(Analyzer_dam, compare) { compare(); }
 #ifdef PADDLE_WITH_MKLDNN
......
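The test rename above tracks a widened EnableMemoryOptim signature. Judging by the inline comments, the first flag selects static memory optimization and the second forces the cached memory plan to be rebuilt, while the zero-argument form covers the dynamic path. A small sketch of the three call shapes (the helper and the flag meanings are my own reading of this diff, not the declared API):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: the three memory-optimization modes the DAM tests exercise.
void ConfigureMemoryOptim(paddle::contrib::AnalysisConfig* cfg, int mode) {
  if (mode == 0) {
    cfg->EnableMemoryOptim(true, true);   // static optim, force cache update
  } else if (mode == 1) {
    cfg->EnableMemoryOptim(true, false);  // static optim, reuse cached plan
  } else {
    cfg->EnableMemoryOptim();             // defaults: dynamic optimization
  }
}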
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <utility>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
@@ -37,7 +38,7 @@ template <typename Place>
 void *Alloc(const Place &place, size_t size);
 template <typename Place>
-void Free(const Place &place, void *p);
+void Free(const Place &place, void *p, size_t size);
 template <typename Place>
 size_t Used(const Place &place);
@@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p);
 using BuddyAllocator = detail::BuddyAllocator;
+std::unordered_map</*device id*/ int,
+                   std::pair</*current memory usage*/ uint64_t,
+                             /*peak memory usage*/ uint64_t>>
+    gpu_mem_info;
 BuddyAllocator *GetCPUBuddyAllocator() {
   // We tried thread_local for the inference::RNN1 model, but it does not
   // help much for multi-thread tests.
@@ -98,7 +104,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 }
 template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
+void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
+                              size_t size) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -177,9 +184,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     LOG(WARNING) << "GPU memory used: "
                  << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
-  }
-  if (FLAGS_init_allocated_mem) {
-    cudaMemset(ptr, 0xEF, size);
+  } else {
+    gpu_mem_info[place.device].first += size;
+    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
+      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
+      VLOG(3) << "device: " << place.device << " peak memory usage : "
+              << (gpu_mem_info[place.device].second >> 20) << " MiB";
+    }
+    if (FLAGS_init_allocated_mem) {
+      cudaMemset(ptr, 0xEF, size);
+    }
   }
   return ptr;
 #else
@@ -188,9 +202,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }
 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) {
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+                               size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+  gpu_mem_info[place.device].first -= size;
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -243,7 +259,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p) {
+                                     void *p, size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor<void *> {
 };
 struct FreeVisitor : public boost::static_visitor<void> {
-  inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {}
+  inline explicit FreeVisitor(void *ptr, size_t size)
+      : ptr_(ptr), size_(size) {}
   template <typename Place>
   inline void operator()(const Place &place) const {
-    Free<Place>(place, ptr_);
+    Free<Place>(place, ptr_, size_);
  }
  private:
   void *ptr_;
+  size_t size_;
 };
 size_t Usage::operator()(const platform::CPUPlace &cpu) const {
@@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
 }
 void LegacyAllocator::Free(Allocation *allocation) {
-  boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()),
-                       allocation->place());
+  boost::apply_visitor(
+      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
+      allocation->place());
   delete allocation;
 }
 } // namespace allocation
......
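The gpu_mem_info bookkeeping above is plain high-water-mark accounting: add on Alloc, subtract on Free, and raise the peak whenever the current level exceeds it. A self-contained sketch of the same scheme (names are mine; the pair layout mirrors the declaration in the hunk):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>

// device id -> (current bytes in use, peak bytes observed)
static std::unordered_map<int, std::pair<uint64_t, uint64_t>> mem_info;

void OnAlloc(int device, size_t size) {
  auto& info = mem_info[device];
  info.first += size;
  if (info.first > info.second) info.second = info.first;  // new high-water mark
}

void OnFree(int device, size_t size) {
  mem_info[device].first -= size;  // needs the original allocation size
}

This is also why the rest of the patch threads size through every Free overload and through FreeVisitor: without the original allocation size, the current-usage counter could not be decremented on free.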
@@ -13,6 +13,7 @@ add_subdirectory(detection)
 add_subdirectory(elementwise)
 add_subdirectory(fused)
 add_subdirectory(metrics)
+add_subdirectory(ngraph)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
@@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()
@@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
......
@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <algorithm>
-#include <map>
+#include "paddle/fluid/operators/beam_search_op.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/beam_search_op.h"
 namespace paddle {
 namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndBeams(pre_ids, &selected_items);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
std::map<size_t /*offset*/, std::vector<Item>> hash;
framework::LoD new_lod;
auto *ids_data = selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
ids_data[low_offset] = item.id;
scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int64_t>();
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id_) ||
pre_ids_data[offset] != end_id_) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) {  // all branches of the beam (source sentence) end and
// prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
items->at(offset).clear();
}
}
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores) {
std::vector<std::vector<Item>> result;
std::vector<Item> items;
// for each source sentence, select the top beam_size items across all
// candidate sets.
while (NextItemSet(pre_ids, pre_scores, &items)) {
std::nth_element(
std::begin(items), std::begin(items) + beam_size_, std::end(items),
[](const Item &a, const Item &b) { return a.score > b.score; });
// prune the top beam_size items.
if (items.size() > beam_size_) {
items.resize(beam_size_);
}
result.emplace_back(items);
}
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result;
}
// the candidates of a source
bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
std::vector<BeamSearch::Item> *items) {
if (sent_offset_ >= ids_->NumElements(lod_level_)) {
return false;
}
// find the current candidates
auto ids = *ids_;
auto scores = *scores_;
auto abs_lod = framework::ToAbsOffset(ids.lod());
auto *ids_data = ids.data<int64_t>();
auto *scores_data = scores.data<float>();
size_t instance_dim = 1;
for (int i = 1; i < ids.dims().size(); i++) {
instance_dim *= ids.dims()[i];
}
auto *pre_ids_data = pre_ids.data<int64_t>();
auto *pre_scores_data = pre_scores.data<float>();
items->clear();
items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id_) {
// Allocate all probability mass to eos_id for finished branches and the
// other candidate ids can be ignored.
items->emplace_back(offset, end_id_, pre_score);
} else {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
}
}
}
sent_offset_++;
return true;
}
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
 class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) The LoDTensor containing the selected ids at the "
              "previous step. It should be a tensor with shape (batch_size, 1) "
              "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
-             "thefirst step.");
+             "the first step.");
     AddInput("pre_scores",
              "(LoDTensor) The LoDTensor containing the accumulated "
              "scores corresponding to the selected ids at the previous step.");
     AddInput("ids",
              "(LoDTensor) The LoDTensor containing the candidates ids. Its "
-             "shape should be (batch_size * beam_size, K), where K supposed to "
-             "be beam_size.");
+             "shape should be (batch_size * beam_size, W). If not set, it will "
+             "be calculated out according to Input(scores) in this operator.")
+        .AsDispensable();
     AddInput("scores",
-             "(LoDTensor) The LodTensor containing the accumulated scores "
-             "corresponding to Input(ids) and its shape is the same as the "
-             "shape of Input(ids).");
+             "(LoDTensor) The LoDTensor containing the current scores "
+             "corresponding to Input(ids). If Input(ids) is not nullptr, its "
+             "shape is the same as that of Input(ids). "
+             "If is_accumulated is true, Input(scores) holds accumulated "
+             "scores and is used directly. Otherwise, each score is "
+             "transformed to the log field and accumulated with "
+             "Input(pre_scores) first.");
     AddOutput("selected_ids",
               "A LodTensor that stores the IDs selected by beam search.");
     AddOutput("selected_scores",
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator does the search in beams for one time step.
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch needs input argument '%s'",
                      arg);
     }
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto *scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4.
+    // Compute on CPU for larger batch sizes.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
......
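The new is_accumulated attribute changes how a step's scores enter the beam: when it is false, each per-step probability is first mapped into log space and added to its prefix's accumulated score. A one-function sketch of that transform, following the OpMaker description above rather than the exact kernel code:

#include <cmath>

// Combine a prefix's accumulated score with one candidate's step score.
float AccumulatedScore(float pre_score, float step_score, bool is_accumulated) {
  if (is_accumulated) return step_score;    // scores already carry the total
  return pre_score + std::log(step_score);  // log-probability accumulation
}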
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,187 +14,12 @@ limitations under the License. */
 #pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/beam_search.h"
 namespace paddle {
 namespace operators {
/*
* This is an implementation of beam search.
*
* To explain the details, let's take the machine translation task as an
* example. In this task, one source sentence is translated into multiple
* target sentences; during this period, one sentence will be translated into
* multiple translation prefixes (target sentences that have not ended). In
* each time step a prefix will have some candidates; given the candidate ids
* and their corresponding scores (probabilities), it will sort and select the
* top beam_size candidates for each source sentence, and store the selected
* candidates' scores and their corresponding ids into LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* LoD (should have 2 levels)
* first level: [0, 1, 4]
* second level: [0, 1, 2, 3, 4]
*
* tensor's data
* [
* [4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]
* ]
*
* scores:
* LoD same as `ids`
* tensor's data
* [
* [0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]
* ]
*
* the inputs mean that there are 2 source sentences to translate; the first
* source has 1 prefix, and the second source has 2 prefixes.
*
* let's assume the beam size is 2, and the beam search's output should be
* LoD
* first level:
* [0, 1, 2]
* second level:
* [0, 2, 4]
*
* id tensor's data
* [[
* 4,
* 1,
* 3,
* 8,
* ]]
*
* score tensor's data
* [[
* 0.5,
* 0.3,
* 0.9,
* 0.7
* ]]
*
* TODO all the prune operations should be in the beam search, so it is better
* to split the beam search algorithm into a sequence of smaller operators, and
* the prune operators can be inserted in this sequence.
*/
class BeamSearch {
public:
// TODO(superjom) make type customizable
using id_t = size_t;
using score_t = float;
/*
* Input the arguments that needed by this class.
*/
BeamSearch(const framework::LoDTensor& ids,
const framework::LoDTensor& scores, size_t level, size_t beam_size,
int end_id)
: beam_size_(beam_size),
ids_(&ids),
scores_(&scores),
lod_level_(level),
end_id_(end_id) {}
/*
* The main function of beam search.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might be the candidate term id sets,
* each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1]
* - [0 1 2]]
* - [[]
* - [0 1]]
* the first level of LoD tells that there are two source sentences. The
* second level describes the details of the candidate id set's offsets in
* the
* source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
* Return false if all the input tensors are empty; in a machine translation
* task that means no candidates are provided, and the task will stop running.
*/
void operator()(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores);
/*
* The basic items help to sort.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// // prefix id in the lower lod level.
// size_t prefix;
// the candidate id
id_t id;
// the corresponding score
score_t score;
};
protected:
/*
* Prune the source sentences whose branches are all finished; it is optional.
* Pruning must happen one step later than finishing (thus pre_ids is needed
* here), since the end tokens must be written out.
*/
void PruneEndBeams(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
/*
* Transform the items into a map whose key is offset, value is the items.
* NOTE low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs, size_t element_num);
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores);
/*
* Get the items of next source sequence, return false if no remaining items.
*/
bool NextItemSet(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
std::vector<Item>* items);
private:
size_t beam_size_;
const framework::LoDTensor* ids_;
const framework::LoDTensor* scores_;
size_t lod_level_{0};
size_t sent_offset_{0};
int end_id_{0};
};
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
 template <typename DeviceContext, typename T>
 class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto* scores = context.Input<framework::LoDTensor>("scores");
     auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
     auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
-    PADDLE_ENFORCE_NOT_NULL(ids);
     PADDLE_ENFORCE_NOT_NULL(scores);
     PADDLE_ENFORCE_NOT_NULL(pre_ids);
     PADDLE_ENFORCE_NOT_NULL(pre_scores);
@@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     size_t level = context.Attr<int>("level");
     size_t beam_size = context.Attr<int>("beam_size");
     int end_id = context.Attr<int>("end_id");
-    BeamSearch alg(*ids, *scores, level, beam_size, end_id);
+    bool is_accumulated = context.Attr<bool>("is_accumulated");
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
-    alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
+    math::BeamSearchFunctor<DeviceContext, T> alg;
+    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
+        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
+        is_accumulated);
   }
 };
 } // namespace operators
 } // namespace paddle
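At its core, what the kernel now delegates to math::BeamSearchFunctor is a per-prefix top-k: for each source prefix, keep the beam_size highest-scoring candidates. A self-contained sketch of that selection step, using std::nth_element the way the removed SelectTopBeamSizeItems did (Item and TopBeam are illustrative names):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Item {
  size_t offset;  // position in the higher LoD level
  int64_t id;     // candidate token id
  float score;    // accumulated score
};

// Keep the beam_size best-scoring items of one candidate set.
std::vector<Item> TopBeam(std::vector<Item> items, size_t beam_size) {
  if (items.size() > beam_size) {
    std::nth_element(
        items.begin(), items.begin() + beam_size, items.end(),
        [](const Item& a, const Item& b) { return a.score > b.score; });
    items.resize(beam_size);
  }
  return items;
}

nth_element partitions without fully sorting, which is all beam pruning needs; full ordering can wait until decoding.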
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <gtest/gtest.h>
#include <vector>
namespace paddle {
namespace test {
using std::vector;
using framework::LoDTensor;
using framework::LoD;
using operators::BeamSearch;
using paddle::platform::CPUPlace;
using std::cout;
using std::endl;
void CreateInput(LoDTensor* ids, LoDTensor* scores) {
LoD lod;
vector<size_t> level0({0, 2, 4});
vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
scores->set_lod(lod);
auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
ids->Resize(dims);
scores->Resize(dims);
CPUPlace place;
auto* ids_data = ids->mutable_data<int64_t>(place);
auto* scores_data = scores->mutable_data<float>(place);
vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
vector<float> _scores(
{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
for (int i = 0; i < 12; i++) {
ids_data[i] = _ids[i];
scores_data[i] = _scores[i];
}
}
// It seems that beam_search_op has bugs.
TEST(DISABLED_beam_search_op, run) {
CPUPlace place;
LoDTensor ids, scores;
CreateInput(&ids, &scores);
LoDTensor pre_ids;
pre_ids.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
}
LoDTensor pre_scores;
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
}
BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
LoDTensor sids, sscores;
beamsearch(pre_ids, pre_scores, &sids, &sscores);
LOG(INFO) << "score: " << sscores << endl;
ASSERT_EQ(sids.lod(), sscores.lod());
vector<int> tids({4, 2, 3, 8});
vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
}
}
} // namespace test
} // namespace paddle
@@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel<T> {
     auto* label = ctx.Input<Tensor>("Label");
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    const int step_size = x->dims()[0];
-    const int num_classes = x->dims()[1];
+    const size_t step_size = static_cast<size_t>(x->dims()[0]);
+    const size_t num_classes = static_cast<size_t>(x->dims()[1]);
     T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
     const T* dy_data = dy->data<T>();
     const T* x_data = x->data<T>();
......
@@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
     auto handle = dev_ctx.cudnn_handle();
-    Tensor cudnn_workspace;
-    void* cudnn_workspace_ptr = nullptr;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
@@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_limit)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
       auto search_func = [&]() {
         int returned_algo_count;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
-        CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-            handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
-            kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(),
-            cudnn_workspace_ptr, workspace_size_limit));
+        auto cudnn_find_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(
+              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
+                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
+        };
+        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
         VLOG(3) << "Perf result: (algo: stat, time, memory)";
         for (int i = 0; i < returned_algo_count; ++i) {
           const auto& stat = fwd_perf_stat[i];
@@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
-    if (!cudnn_workspace_ptr) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_in_bytes)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
       // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // cudnnConvolutionForward and cudnnAddTensor
       // ------------- cudnn conv forward and bias add ---------------------
       ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
-          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
           handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
           output_data));
@@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // ------------------- cudnn conv+bias+act forward --------------------
       ScalingParamType<T> alpha1 = 1.0f;
       ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-          output_data));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+            output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
     std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
     if (channels.size()) {
......
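These hunks revert the conv kernels from the temporary-allocator style back to the workspace-handle style: instead of materializing a scratch tensor and passing a raw pointer around, each call site wraps the cuDNN call in a lambda and lets the handle own the workspace for exactly the duration of the call. A minimal sketch of that callback contract (CPU stand-in; no CUDA required; the class here is illustrative, not Paddle's implementation):

#include <cstdlib>
#include <functional>

// Sketch of the RunFunc contract: acquire a scratch buffer, hand it to the
// callback, and release it when the callback returns.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& func, size_t required_bytes) {
    void* workspace = std::malloc(required_bytes);  // stand-in for GPU scratch
    func(workspace);        // the cuDNN call happens here
    std::free(workspace);   // the workspace never outlives the call
  }
};

Scoping the buffer to the callback keeps call sites from holding a raw workspace pointer across unrelated code, which is exactly the shape each cudnn_func lambda above restores.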
@@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto temp_allocation =
-        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-            workspace_size_in_bytes);
-    void* cudnn_workspace = temp_allocation->ptr();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-          cudnn_output_desc, output_data + output_offset * g));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_output_desc, output_data + output_offset * g));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
         output_grad->numel() / output_grad->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto temp_allocation =
-        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-            workspace_size_in_bytes);
-    void* cudnn_workspace = temp_allocation->ptr();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + input_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+              input_grad_data + input_offset * g));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
     }
@@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + filter_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_input_desc,
+              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + filter_offset * g));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
     }
   }
......
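Reviewer note: the two conv-transpose hunks above replace a long-lived temporary allocation with a scoped workspace callback. Below is a minimal sketch of that pattern; WorkspaceRunner is a hypothetical stand-in for the real dev_ctx.cudnn_workspace_handle(), and std::malloc stands in for device allocation.

// Hedged sketch only: the point is the lifetime — the workspace exists
// just for the duration of the callback, instead of spanning the kernel.
#include <cstddef>
#include <cstdlib>
#include <functional>

class WorkspaceRunner {
 public:
  void RunFunc(const std::function<void(void*)>& func, std::size_t bytes) {
    void* workspace = std::malloc(bytes);  // acquire scratch space
    func(workspace);                       // run the cuDNN call against it
    std::free(workspace);                  // release immediately afterwards
  }
};

int main() {
  WorkspaceRunner runner;
  auto cudnn_func = [&](void* cudnn_workspace) {
    // here the kernel would call
    // cudnnConvolutionBackwardData(..., cudnn_workspace, ...)
    (void)cudnn_workspace;
  };
  runner.RunFunc(cudnn_func, 1 << 20);
  return 0;
}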
...@@ -20,7 +20,7 @@ if(WITH_GRPC) ...@@ -20,7 +20,7 @@ if(WITH_GRPC)
collective_client.cc collective_server.cc collective_client.cc collective_server.cc
${GRPC_SRCS} ${GRPC_SRCS}
PROTO send_recv.proto PROTO send_recv.proto
DEPS lod_tensor selected_rows_functor memory) DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS})
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
...@@ -32,15 +32,17 @@ else() ...@@ -32,15 +32,17 @@ else()
set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib)
brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc variable_response.cc
collective_client.cc collective_server.cc collective_client.cc collective_server.cc
${BRPC_SRCS} ${BRPC_SRCS}
PROTO send_recv.proto PROTO send_recv.proto
DEPS lod_tensor selected_rows memory) DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS})
set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS})
cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
endif() endif()
......
...@@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, ...@@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "SendRPC"; const std::string method = kSendRPC;
VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
framework::AsyncIO([=] { framework::AsyncIO([=] {
...@@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
const std::string& method_name, const std::string& method_name,
int64_t time_out) { int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const std::string out_varname_val = out_var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "GetRPC"; const std::string method = kGetRPC;
VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); VarHandlePtr var_h(
new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
framework::AsyncIO([=] { framework::AsyncIO([=] {
auto ch_ctx = ch_ptr->Pop(); auto ch_ctx = ch_ptr->Pop();
...@@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
req.set_out_varname(out_varname_val);
req.set_trainer_id(trainer_id_); req.set_trainer_id(trainer_id_);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
...@@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method, p_ctx);
if (method_name == "GetMonomerVariable") { if (method_name == kGetMonomerRPC) {
ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
} else if (method_name == kGetNoBarrierRPC) {
ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done);
} else { } else {
ch_ctx->stub->GetVariable(cntl, &req, response, done); ch_ctx->stub->GetVariable(cntl, &req, response, done);
} }
...@@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
return var_h; return var_h;
} }
VarHandlePtr BRPCClient::AsyncGetVarNoBarrier(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
const std::string& out_var_name, int64_t time_out) {
std::string var_name_no_barrier =
string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name,
kGetNoBarrierRPC, time_out);
}
VarHandlePtr BRPCClient::AsyncGetMonomerVariable( VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
const std::string& ep, const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC,
time_out);
} }
VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
const std::string& var_name, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out);
} }
VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
time_out);
} }
VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
...@@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "PrefetchRPC"; const std::string method = kPrefetchRPC;
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
...@@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE,
time_out); time_out);
} }
...@@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
const std::string method = "FetchBarrierRPC"; const std::string method = kFetchBarrierRPC;
// var handle // var handle
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
...@@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ...@@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out);
} }
void BRPCClient::SendComplete() { void BRPCClient::SendComplete() {
...@@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( ...@@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
if (method_name == "CheckPointNotifyRPC") { if (method_name == kCheckPointNotifyRPC) {
ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
} else if (method_name == "GetMonomerBarrier") { } else if (method_name == kSendMonomerFetchBarrierRPC) {
ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
} else { } else {
ch_ctx->stub->SendVariable(cntl, &req, response, done); ch_ctx->stub->SendVariable(cntl, &req, response, done);
......
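Reviewer note: the client changes above replace ad-hoc method-name literals with shared constants and add a no-barrier get that tags the variable name with a suffix (see the Sprintf call in AsyncGetVarNoBarrier). A small self-contained sketch of that convention follows; the constant values are illustrative guesses, not the ones defined in Paddle's request_handler.h.

#include <iostream>
#include <string>

constexpr char kGetRPC[] = "GetRPC";                      // assumed value
constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC";    // assumed value
constexpr char kWithoutBarrier[] = "@WITHOUT_BARRIER@";   // assumed marker

// The server side strips this suffix again to recover the real name.
std::string NoBarrierName(const std::string& var_name) {
  return var_name + kWithoutBarrier;
}

int main() {
  std::cout << "barrier get:    " << kGetRPC << "\n";
  std::cout << "no-barrier get: " << NoBarrierName("w@GRAD") << " via "
            << kGetNoBarrierRPC << "\n";
  return 0;
}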
...@@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { ...@@ -65,6 +65,7 @@ class BRPCClient : public RPCClient {
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetMonomerBarrier( VarHandlePtr AsyncGetMonomerBarrier(
...@@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { ...@@ -76,6 +77,13 @@ class BRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
const std::string& out_varname,
int64_t time_out = FLAGS_rpc_deadline);
VarHandlePtr AsyncPrefetchVar(const std::string& ep, VarHandlePtr AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
...@@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { ...@@ -103,6 +111,7 @@ class BRPCClient : public RPCClient {
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
const std::string& method_name, const std::string& method_name,
int64_t time_out = FLAGS_rpc_deadline); int64_t time_out = FLAGS_rpc_deadline);
......
...@@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService {
rpc_server_->GetThreadNum(distributed::kRequestGet))); rpc_server_->GetThreadNum(distributed::kRequestGet)));
} }
it = rpc_call_map.find(distributed::kRequestGetNoBarrier);
if (it != rpc_call_map.end()) {
request_getnobarrier_h_ = it->second;
getnobarrier_threads_.reset(new paddle::framework::ThreadPool(
rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier)));
}
it = rpc_call_map.find(distributed::kRequestPrefetch); it = rpc_call_map.find(distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second; request_prefetch_h_ = it->second;
...@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService {
[=] { _GetVariable(cntl_butil, request, response, done); }); [=] { _GetVariable(cntl_butil, request, response, done); });
} }
void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request,
VariableMessage* response,
google::protobuf::Closure* done) override {
getnobarrier_threads_->Run(
[=] { _GetVariableNoBarrier(cntl_butil, request, response, done); });
}
void _GetVariable(google::protobuf::RpcController* cntl_butil, void _GetVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, VariableMessage* response, const VariableMessage* request, VariableMessage* response,
google::protobuf::Closure* done) { google::protobuf::Closure* done) {
...@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService {
brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil); brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
std::string varname = request->varname(); std::string varname = request->varname();
std::string out_varname = request->out_varname();
VLOG(3) << "RequestGet varname:" << varname VLOG(3) << "RequestGet varname:" << varname
<< ", out_varname:" << out_varname
<< ", trainer_id:" << request->trainer_id() << ", trainer_id:" << request->trainer_id()
<< ", from:" << cntl->remote_side(); << ", from:" << cntl->remote_side();
auto scope = request_get_h_->scope(); auto scope = request_get_h_->scope();
auto invar = scope->FindVar(varname); paddle::framework::Variable* invar = nullptr;
int trainer_id = request->trainer_id();
paddle::framework::Variable* outvar = nullptr;
request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id,
out_varname);
if (outvar) {
distributed::SerializeToIOBuf(out_varname, outvar,
*request_get_h_->dev_ctx(), response,
&cntl->response_attachment(), "", false);
}
}
void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request,
VariableMessage* response,
google::protobuf::Closure* done) {
PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr,
"RequestGetNoBarrier handler should be registed first!");
brpc::ClosureGuard done_guard(done);
brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
std::string varname = request->varname();
std::string out_varname = request->out_varname();
int trainer_id = request->trainer_id(); int trainer_id = request->trainer_id();
VLOG(3) << "RequestGetNoBarrier varname:" << varname
<< ", out_varname:" << out_varname << ", trainer_id:" << trainer_id
<< ", from:" << cntl->remote_side();
auto scope = request_getnobarrier_h_->scope();
paddle::framework::Variable* invar = nullptr;
paddle::framework::Variable* outvar = nullptr; paddle::framework::Variable* outvar = nullptr;
request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id,
out_varname);
if (outvar) { if (outvar) {
distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), distributed::SerializeToIOBuf(
response, &cntl->response_attachment(), "", out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response,
false); &cntl->response_attachment(), "", false);
} }
} }
void PrefetchVariable(google::protobuf::RpcController* cntl_butil, void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, const VariableMessage* request,
VariableMessage* response, VariableMessage* response,
...@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService {
private: private:
distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_send_h_{nullptr};
distributed::RequestHandler* request_get_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr};
distributed::RequestHandler* request_getnobarrier_h_{nullptr};
distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr};
distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr};
distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
...@@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService {
distributed::RPCServer* rpc_server_{nullptr}; distributed::RPCServer* rpc_server_{nullptr};
// FIXME(gongwb): brpc should support process one rpce use one threadpool. // FIXME(gongwb): brpc should support processing one rpc with one threadpool.
std::unique_ptr<paddle::framework::ThreadPool> send_threads_; std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
std::unique_ptr<paddle::framework::ThreadPool> get_threads_; std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
std::unique_ptr<paddle::framework::ThreadPool> getnobarrier_threads_;
std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_; std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_; std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
}; };
......
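Reviewer note: the server changes above give the new kRequestGetNoBarrier handler its own thread pool, mirroring the existing send/get pools, presumably so a burst of one request type cannot starve the others. A simplified sketch of the register-then-dispatch shape, with std::async standing in for paddle::framework::ThreadPool and all names illustrative:

#include <future>
#include <iostream>
#include <map>
#include <string>

struct Handler { std::string name; };

int main() {
  std::map<std::string, Handler> rpc_call_map = {
      {"RequestGet", {"get"}}, {"RequestGetNoBarrier", {"get_no_barrier"}}};

  // Registration: only wire up a pool if the handler was registered.
  auto it = rpc_call_map.find("RequestGetNoBarrier");
  if (it != rpc_call_map.end()) {
    // Dispatch: run the handler body off the RPC thread, on its own worker.
    auto fut = std::async(std::launch::async, [h = it->second] {
      std::cout << "handling " << h.name << " on its own worker\n";
    });
    fut.wait();
  }
  return 0;
}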
...@@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async // Async
if (!sync_mode_) { if (!sync_mode_) {
VLOG(3) << "async process var: " << varname; VLOG(3) << "async process var: " << varname;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), if (varname == BATCH_BARRIER_MESSAGE) {
scope); PADDLE_THROW(
delete scope; "async mode should not recv BATCH_BARRIER_MESSAGE or "
"COMPLETE_MESSAGE");
}
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
delete scope;
} catch (std::exception& e) {
LOG(ERROR) << "async: run sub program error " << e.what();
return false;
}
return true; return true;
} else { // sync } else { // sync
rpc_server_->WaitCond(kRequestSend); rpc_server_->WaitCond(kRequestSend);
......
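Reviewer note: the async-mode hunk above adds two guards — reject barrier messages outright, and isolate failures of the per-gradient sub-program so they surface as a failed RPC instead of tearing down the server. A distilled sketch of the second guard (RunSubProgram stands in for executor_->RunPreparedContext):

#include <exception>
#include <iostream>
#include <stdexcept>

bool RunSubProgram(bool should_fail) {
  if (should_fail) throw std::runtime_error("op error");
  return true;
}

bool HandleAsync(bool should_fail) {
  try {
    return RunSubProgram(should_fail);
  } catch (std::exception& e) {
    std::cerr << "async: run sub program error " << e.what() << "\n";
    return false;  // reported to the client as an RPC failure
  }
}

int main() {
  std::cout << HandleAsync(true) << "\n";   // prints 0 after logging the error
  std::cout << HandleAsync(false) << "\n";  // prints 1
  return 0;
}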
...@@ -39,27 +39,33 @@ void RPCServer::SavePort() const { ...@@ -39,27 +39,33 @@ void RPCServer::SavePort() const {
port_file.open(file_path); port_file.open(file_path);
port_file << selected_port_; port_file << selected_port_;
port_file.close(); port_file.close();
VLOG(4) << "selected port written to " << file_path; VLOG(3) << "selected port written to " << file_path;
} }
void RPCServer::WaitBarrier(const std::string& rpc_name) { void RPCServer::WaitBarrier(const std::string& rpc_name) {
VLOG(3) << "WaitBarrier in: " << rpc_name;
std::unique_lock<std::mutex> lock(this->mutex_); std::unique_lock<std::mutex> lock(this->mutex_);
barrier_cond_.wait(lock, [this, &rpc_name] { barrier_cond_.wait(lock, [this, &rpc_name] {
return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
exit_flag_.load()); exit_flag_.load());
}); });
VLOG(3) << "batch_barrier_: " << rpc_name << " " VLOG(3) << "WaitBarrier out: " << rpc_name
<< barrier_counter_[rpc_name]; << " counter: " << barrier_counter_[rpc_name];
} }
void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
// a barrier msg must make sure that it arrives under the right cond (send|recv)
WaitCond(rpc_name);
int b = 0; int b = 0;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
b = ++barrier_counter_[rpc_name]; b = ++barrier_counter_[rpc_name];
VLOG(3) << rpc_name << " barrier_counter: " << b;
if (b >= client_num_) { if (b >= client_num_) {
lock.unlock(); lock.unlock();
VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for "
<< rpc_name;
barrier_cond_.notify_all(); barrier_cond_.notify_all();
lock.lock(); lock.lock();
} }
...@@ -71,7 +77,7 @@ void RPCServer::Complete() { ...@@ -71,7 +77,7 @@ void RPCServer::Complete() {
client_num_--; client_num_--;
need_reset_all_vars_ = true; need_reset_all_vars_ = true;
VLOG(4) << "decrease client_num to: " << client_num_; VLOG(3) << "decrease client_num to: " << client_num_;
if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
barrier_counter_[kRequestGet]--; barrier_counter_[kRequestGet]--;
} }
...@@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, ...@@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static int cond = -1; static int cond = -1;
rpc_cond_map_[rpc_name] = ++cond; rpc_cond_map_[rpc_name] = ++cond;
VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler
<< ", cond:" << rpc_cond_map_[rpc_name]; << ", cond: " << rpc_cond_map_[rpc_name];
} }
void RPCServer::SetCond(const std::string& rpc_name) { void RPCServer::SetCond(const std::string& rpc_name) {
...@@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { ...@@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
} }
void RPCServer::WaitCond(const std::string& rpc_name) { void RPCServer::WaitCond(const std::string& rpc_name) {
VLOG(4) << "RPCServer WaitCond " << rpc_name; VLOG(3) << "RPCServer WaitCond in " << rpc_name;
int cond = 0; int cond = 0;
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
...@@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { ...@@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
rpc_cond_.wait( rpc_cond_.wait(
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
VLOG(3) << "RPCServer WaitCond out " << rpc_name;
} }
void RPCServer::RegisterVar(const std::string& var_name, void RPCServer::RegisterVar(const std::string& var_name,
...@@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, ...@@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name,
} }
rpc_cond_.notify_all(); rpc_cond_.notify_all();
VLOG(4) << "RegisterVar context:" << h.String(); VLOG(3) << "RegisterVar context:" << h.String();
} }
void RPCServer::IncreaseVarBarrier(const std::string& var_name) { void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
...@@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { ...@@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
barrier_cond_.notify_all(); barrier_cond_.notify_all();
} }
VLOG(4) << "IncreaseVarBarrier context:" << h.String(); VLOG(3) << "IncreaseVarBarrier context:" << h.String();
} }
void RPCServer::WaitVarBarrier(const std::string& var_name) { void RPCServer::WaitVarBarrier(const std::string& var_name) {
VLOG(4) << "WaitBarrier var_name:" << var_name; VLOG(3) << "WaitVarBarrier var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
barrier_cond_.wait(lock, [&]() { barrier_cond_.wait(lock, [&]() {
...@@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { ...@@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) {
exit_flag_.load()); exit_flag_.load());
}); });
VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String();
} }
void RPCServer::SetVarCond(const std::string& var_name) { void RPCServer::SetVarCond(const std::string& var_name) {
VLOG(4) << "SetVarCond var_name:" << var_name; VLOG(3) << "SetVarCond var_name:" << var_name;
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
if (var_map_.find(var_name) != var_map_.end()) { if (var_map_.find(var_name) != var_map_.end()) {
...@@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { ...@@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) {
} }
void RPCServer::WaitVarCond(const std::string& var_name) { void RPCServer::WaitVarCond(const std::string& var_name) {
VLOG(4) << "WaitVarCond var_name:" << var_name; VLOG(3) << "WaitVarCond var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
rpc_cond_.wait(lock, [=] { rpc_cond_.wait(lock, [=] {
return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
}); });
VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; VLOG(3) << "WaitVarCond var_name:" << var_name << " end";
} }
MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
......
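Reviewer note: the WaitBarrier/IncreaseBatchBarrier pair above is a counter guarded by a mutex, with a condition variable that releases every waiter once all clients have checked in (or the server is exiting). A minimal self-contained sketch of the same mechanism, stripped of the per-RPC-name bookkeeping:

#include <atomic>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class Barrier {
 public:
  explicit Barrier(int clients) : client_num_(clients) {}

  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock,
               [this] { return counter_ >= client_num_ || exit_.load(); });
  }

  void Increase() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_ >= client_num_) {
      lock.unlock();        // mirror the unlock-before-notify in the diff
      cond_.notify_all();
    }
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  int counter_{0};
  int client_num_;
  std::atomic<bool> exit_{false};  // stands in for exit_flag_
};

int main() {
  Barrier b(2);
  std::thread t1([&] { b.Increase(); });
  std::thread t2([&] { b.Increase(); });
  b.Wait();  // returns once both "clients" arrived
  t1.join();
  t2.join();
  std::cout << "all clients arrived\n";
  return 0;
}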
...@@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( ...@@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData(
tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));
VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
<< ", Buffer Size = " << length; << ", Buffer Size = " << length << ", dims:" << dims
PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length)); << ", numel:" << tensor->numel();
PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast<unsigned int>(length));
return ReadRaw(input, ctx, tensor->place(), tensor_data, length); return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
} }
......
...@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( ...@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop(
while (true) { while (true) {
// Get from multiple trainers; we don't care about the order in which // Get from multiple trainers; we don't care about the order in which
// the gradients arrive, just add suffix 0~n and merge the gradient. // the gradients arrive, just add suffix 0~n and merge the gradient.
VLOG(3) << "wait all clients to send gradient";
rpc_service_->SetCond(distributed::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
VLOG(3) << "wait all clients to send send_barrier";
rpc_service_->WaitBarrier(distributed::kRequestSend); rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
...@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( ...@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop(
} }
ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
VLOG(3) << "ResetReceivedVars";
ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
VLOG(3) << "wait all clients to get parameters back";
rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
VLOG(3) << "wait all clients to send fetch_barrier";
rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
VLOG(3) << "ResetBarrierCounter";
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
} // while(true) } // while(true)
} }
......
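Reviewer note: the extra VLOGs above make the phases of RunSyncLoop legible. A schematic of one synchronous step, with RpcServerStub as a stand-in for distributed::RPCServer (only the calls used by the loop; the real loop also handles exit and var resets):

#include <iostream>
#include <string>

struct RpcServerStub {
  void SetCond(const std::string& rpc) { std::cout << "open " << rpc << "\n"; }
  void WaitBarrier(const std::string& rpc) {
    std::cout << "barrier " << rpc << "\n";
  }
  void ResetBarrierCounter() { std::cout << "reset counters\n"; }
};

int main() {
  RpcServerStub rpc;
  rpc.SetCond("RequestSend");      // 1. accept gradients from all trainers
  rpc.WaitBarrier("RequestSend");  // 2. every trainer sent its send_barrier
  std::cout << "run optimize blocks, reset received vars\n";  // 3.
  rpc.SetCond("RequestGet");       // 4. trainers fetch updated parameters
  rpc.WaitBarrier("RequestGet");   // 5. every trainer sent its fetch_barrier
  rpc.ResetBarrierCounter();       // 6. next step starts from zero
  return 0;
}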
...@@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(ids.size(), outs.size(), PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
"the number of Ids and Out should be the same"); "the number of Ids and Out should be the same");
size_t row_ids_size = 0; int64_t row_ids_size = 0;
int row_size = 0; int64_t row_size = 0;
int embedding_size = 0; int64_t embedding_size = 0;
for (size_t i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *x_tensor = x_tensors[i]; const auto *x_tensor = x_tensors[i];
...@@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *row_id = row_ids[i]; const auto *row_id = row_ids[i];
for (int j = 0; j < row_id->numel(); ++j) { for (auto j = 0; j < row_id->numel(); ++j) {
int64_t key = row_id->data<int64_t>()[j]; int64_t key = row_id->data<int64_t>()[j];
std::tuple<int64_t, int64_t> val = std::make_tuple(i, j); std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
selected_rows_idx_map.insert(std::make_pair(key, val)); selected_rows_idx_map.insert(std::make_pair(key, val));
...@@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
out->set_lod(out_ids->lod()); out->set_lod(out_ids->lod());
int nums = static_cast<int>(out_ids->dims()[0]); auto nums = out_ids->dims()[0];
auto *out_data = out->mutable_data<T>( auto *out_data = out->mutable_data<T>(
framework::make_ddim({nums, embedding_size}), place); framework::make_ddim({nums, embedding_size}), place);
for (int j = 0; j < nums; ++j) { for (auto j = 0; j < nums; ++j) {
int id = out_ids->data<int64_t>()[j]; auto id = out_ids->data<int64_t>()[j];
auto row_tuple = selected_rows_idx_map[id]; auto row_tuple = selected_rows_idx_map.at(id);
int64_t row_idx = std::get<1>(row_tuple); auto row_idx = std::get<1>(row_tuple);
const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];
memcpy(out_data + embedding_size * j, memcpy(out_data + embedding_size * j,
......
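Reviewer note: alongside the int-to-int64_t widening, the merge_ids hunk swaps operator[] for .at(). That is defensive: operator[] on an absent id silently default-constructs an entry, while .at() throws, turning a missing-id bug into a loud failure. A minimal illustration:

#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <tuple>

int main() {
  std::map<int64_t, std::tuple<int64_t, int64_t>> idx_map;
  idx_map[7] = std::make_tuple(0, 3);

  auto ok = idx_map.at(7);               // found: (0, 3)
  std::cout << std::get<1>(ok) << "\n";  // prints 3

  try {
    idx_map.at(8);                       // absent id: throws immediately
  } catch (const std::out_of_range&) {
    std::cout << "id 8 was never inserted\n";
  }
  return 0;
}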
...@@ -277,68 +277,6 @@ class TransformFunctor { ...@@ -277,68 +277,6 @@ class TransformFunctor {
Functor func_; Functor func_;
}; };
#define EIGEN_FUNCTOR(name, eigen_op) \
struct Eigen##name##Functor { \
template <typename DeviceContext, typename T> \
inline void Run(const framework::Tensor *x, const framework::Tensor *y, \
framework::Tensor *z, \
const framework::ExecutionContext &ctx) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_e); \
} \
template <typename DeviceContext, typename T> \
inline void RunBroadCast(const framework::Tensor *x, \
const framework::Tensor *y, framework::Tensor *z, \
const framework::ExecutionContext &ctx, int pre, \
int n) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n)) \
.broadcast(Eigen::DSizes<int, 2>(pre, 1)) \
.reshape(Eigen::DSizes<int, 1>(x_e.size())); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_bcast); \
} \
template <typename DeviceContext, typename T> \
inline void RunBroadCast2(const framework::Tensor *x, \
const framework::Tensor *y, \
framework::Tensor *z, \
const framework::ExecutionContext &ctx, int pre, \
int n, int post) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1)) \
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post)) \
.reshape(Eigen::DSizes<int, 1>(x_e.size())); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_bcast); \
} \
}
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD);
#define EIGEN_SUB(x, y) ((x) - (y))
EIGEN_FUNCTOR(Sub, EIGEN_SUB);
#define EIGEN_MUL(x, y) ((x) * (y))
EIGEN_FUNCTOR(Mul, EIGEN_MUL);
#define EIGEN_DIV(x, y) ((x) / (y))
EIGEN_FUNCTOR(Div, EIGEN_DIV);
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
struct ElemwiseGradNoBroadcast { struct ElemwiseGradNoBroadcast {
const T *x_; const T *x_;
......
...@@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
out_datas.push_back( out_datas.push_back(
static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w)); static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
auto temp_allocation =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
workspace_size_in_bytes);
void* cudnn_workspace = temp_allocation->ptr();
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( auto func = [&](void* cudnn_workspace) {
handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
static_cast<const void*>(filters[i]->data<T>()), conv_desc[i], handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
out_datas[i], bias_desc[i], algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc, out_desc[i], out_datas[i], bias_desc[i],
out_desc[i], out_datas[i])); static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
} }
cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t x_desc;
......
...@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { ...@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
"Input(X) and Input(Grid) dims[0] should be equal."); "Input(X) and Input(Grid) dims[0] should be equal.");
PADDLE_ENFORCE_EQ( if (ctx->IsRuntime()) {
grid_dims[1], x_dims[2], PADDLE_ENFORCE_EQ(
"Input(X) dims[2] and Input(Grid) dims[1] should be equal."); grid_dims[1], x_dims[2],
PADDLE_ENFORCE_EQ( "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
grid_dims[2], x_dims[3], PADDLE_ENFORCE_EQ(
"Input(X) dims[3] and Input(Grid) dims[2] should be equal."); grid_dims[2], x_dims[3],
"Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
}
ctx->SetOutputDim("Output", x_dims); ctx->SetOutputDim("Output", x_dims);
ctx->ShareLoD("X", "Output"); ctx->ShareLoD("X", "Output");
......
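Reviewer note: the grid_sample hunk wraps the dim-equality checks in ctx->IsRuntime(). At graph-build time some dims may still be unknown placeholders (e.g. -1), so equality checks are deferred until concrete shapes exist. A simplified stand-in for the pattern (the real checks use InferShapeContext and PADDLE_ENFORCE_EQ):

#include <cassert>
#include <cstdio>

void CheckGridDims(bool is_runtime, int grid_h, int x_h) {
  if (is_runtime) {
    // Concrete shapes: enforce Grid dims[1] == X dims[2].
    assert(grid_h == x_h && "Input(X) dims[2] and Input(Grid) dims[1] differ");
  }
  // At graph-build time, -1 placeholders would make this check spurious.
}

int main() {
  CheckGridDims(/*is_runtime=*/false, /*grid_h=*/-1, /*x_h=*/32);  // skipped
  CheckGridDims(/*is_runtime=*/true, /*grid_h=*/32, /*x_h=*/32);   // enforced
  std::puts("shape checks passed");
  return 0;
}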
...@@ -21,5 +21,5 @@ endif() ...@@ -21,5 +21,5 @@ endif()
cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
if(NOT WIN32) if(NOT WIN32)
cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor)
endif() endif()
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
LOG(INFO) << loginfos.str(); LOG(INFO) << loginfos.str();
} }
using Tensor = paddle::framework::Tensor;
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYZNKernel() { void BenchXYZNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
std::vector<T> x(d), y(d), z(d); Tensor x, y, z;
RandomVec<T>(d, x.data()); x.Resize({d});
RandomVec<T>(d, y.data()); y.Resize({d});
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data(), y.data(), z.Resize({d});
z.data(), d); T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
T* z_data = z.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
RandomVec<T>(d, y_data);
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
y.data<T>(), z_data, d);
} }
} }
...@@ -170,9 +179,13 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> ...@@ -170,9 +179,13 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() { void BenchAXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const T a = static_cast<T>(3); const T a = static_cast<T>(3);
std::vector<T> x(d), y(d); Tensor x, y;
RandomVec<T>(d, x.data()); x.Resize({d});
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data(), y.data(), y.Resize({d});
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
d); d);
} }
} }
...@@ -180,9 +193,13 @@ void BenchAXYNKernel() { ...@@ -180,9 +193,13 @@ void BenchAXYNKernel() {
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYNKernel() { void BenchXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
std::vector<T> x(d), y(d); Tensor x, y;
RandomVec<T>(d, x.data()); x.Resize({d});
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data(), y.data(), d); y.Resize({d});
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
} }
} }
...@@ -192,16 +209,23 @@ void BenchLSTMKernel() { ...@@ -192,16 +209,23 @@ void BenchLSTMKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
use_peephole); use_peephole);
std::vector<T> x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); Tensor x, ct_1, ct, ht, wp, checked;
RandomVec<T>(4 * d, x.data(), -2.f, 2.f); x.Resize({4 * d});
RandomVec<T>(3 * d, wp.data(), -2.f, 2.f); ct_1.Resize({d});
RandomVec<T>(d, ct_1.data(), -2.f, 2.f); ct.Resize({d});
const T* ct_1_data = ct_1.data(); ht.Resize({d});
const T* wp_data = wp.data(); wp.Resize({3 * d});
T* x_data = x.data(); checked.Resize({2 * d});
T* checked_data = checked.data(); auto place = PlaceType();
T* ct_data = ct.data(); RandomVec<T>(x.numel(), x.mutable_data<T>(place), -2.f, 2.f);
T* ht_data = ht.data(); RandomVec<T>(wp.numel(), wp.mutable_data<T>(place), -2.f, 2.f);
RandomVec<T>(ct_1.numel(), ct_1.mutable_data<T>(place), -2.f, 2.f);
const T* ct_1_data = ct_1.data<T>();
const T* wp_data = wp.data<T>();
T* x_data = x.mutable_data<T>(place);
T* checked_data = checked.mutable_data<T>(place);
T* ct_data = ct.mutable_data<T>(place);
T* ht_data = ht.mutable_data<T>(place);
jit::lstm_t step; jit::lstm_t step;
step.gates = x_data; step.gates = x_data;
step.ct_1 = ct_1_data; step.ct_1 = ct_1_data;
...@@ -220,12 +244,16 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> ...@@ -220,12 +244,16 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchGRUKernel() { void BenchGRUKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
std::vector<T> x(3 * d), ht_1(d), ht(d); auto place = PlaceType();
RandomVec<T>(3 * d, x.data(), -2.f, 2.f); Tensor x, ht_1, ht;
RandomVec<T>(d, ht_1.data(), -2.f, 2.f); x.Resize({3 * d});
const T* ht_1_data = ht_1.data(); ht_1.Resize({d});
T* x_data = x.data(); ht.Resize({d});
T* ht_data = ht.data(); RandomVec<T>(3 * d, x.mutable_data<T>(place), -2.f, 2.f);
RandomVec<T>(d, ht_1.mutable_data<T>(place), -2.f, 2.f);
const T* ht_1_data = ht_1.data<T>();
T* x_data = x.mutable_data<T>(place);
T* ht_data = ht.mutable_data<T>(place);
jit::gru_t step; jit::gru_t step;
step.gates = x_data; step.gates = x_data;
step.ht_1 = ht_1_data; step.ht_1 = ht_1_data;
...@@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { ...@@ -243,10 +271,12 @@ void BenchSeqPoolKernel() {
jit::seq_pool_attr_t attr(w, type); jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) { for (int h : TestSizes()) {
attr.h = h; attr.h = h;
std::vector<T> x(h * w), y(w); Tensor x, y;
RandomVec<T>(h * w, x.data(), -2.f, 2.f); x.Resize({h * w});
const T* x_data = x.data(); y.Resize({w});
T* y_data = y.data(); RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data, BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr); y_data, &attr);
} }
...@@ -259,12 +289,15 @@ void BenchMatMulKernel() { ...@@ -259,12 +289,15 @@ void BenchMatMulKernel() {
for (int m : {1, 2, 3, 4}) { for (int m : {1, 2, 3, 4}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
for (int k : TestSizes()) { for (int k : TestSizes()) {
std::vector<T> a(m * k), b(k * n), c(m * n); Tensor a, b, c;
RandomVec<T>(m * k, a.data(), -2.f, 2.f); a.Resize({m * k});
RandomVec<T>(k * n, b.data(), -2.f, 2.f); b.Resize({k * n});
const T* a_data = a.data(); c.Resize({m * n});
const T* b_data = b.data(); RandomVec<T>(m * k, a.mutable_data<T>(PlaceType()), -2.f, 2.f);
T* c_data = c.data(); RandomVec<T>(k * n, b.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* a_data = a.data<T>();
const T* b_data = b.data<T>();
T* c_data = c.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data, BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
c_data, m, n, k); c_data, m, n, k);
} }
......
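Reviewer note: the benchmark refactor above swaps std::vector buffers for framework Tensors, so the timed path uses the same place-aware allocation as real kernels. A distilled version of the Resize + mutable_data idiom with a hypothetical ToyTensor in place of framework::Tensor (whose mutable_data also takes a Place):

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct ToyTensor {
  void Resize(int64_t n) { buf_.resize(n); }
  T* mutable_data() { return buf_.data(); }      // allocate + writable pointer
  const T* data() const { return buf_.data(); }  // read-only pointer
  std::vector<T> buf_;
};

int main() {
  const int d = 8;
  ToyTensor<float> x, y;
  x.Resize(d);                        // 1. declare the shape
  float* x_data = x.mutable_data();   // 2. allocate and write
  for (int i = 0; i < d; ++i) x_data[i] = 0.5f * i;  // RandomVec stand-in
  y.Resize(d);
  float* y_data = y.mutable_data();
  for (int i = 0; i < d; ++i) y_data[i] = x.data()[i] + 1.f;  // kernel stand-in
  std::printf("y[3] = %.1f\n", y_data[3]);
  return 0;
}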
...@@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mid->mutable_data<T>(ctx.GetPlace()); mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n"); const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha"); // MKL-DNN implements LRN in a caffe way:
// http://caffe.berkeleyvision.org/tutorial/layers/lrn.html
// Where sum of squares is divided by size of normalization window
// this is not the case for PaddlePaddle LRN.
// Hence we need to compensate for this diffrence by
// multipliing alpha by size of window(n)
const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
const float beta = ctx.Attr<float>("beta"); const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k"); const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
...@@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dims = paddle::framework::vectorize2int(x->dims()); auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc( auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); dims, mkldnn::memory::data_type::f32, x->format());
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels, mkldnn::lrn_across_channels,
...@@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
k}; k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
if (!is_test) { if (!is_test) {
const std::string key = ctx.op().Output("Out"); const std::string key = ctx.op().Output("Out");
...@@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory->set_data_handle( src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data))); static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
static_cast<void*>(output_data));
auto workspace_memory = insert_to_context<mkldnn::memory>( auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx, key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc()); forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} else { } else {
auto forward_pd = auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
...@@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))}; src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory = auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()}; mkldnn::memory{forward_pd.workspace_primitive_desc()};
auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
static_cast<void*>(output_data));
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} }
} }
}; };
...@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n"); const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha"); const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
const float beta = ctx.Attr<float>("beta"); const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k"); const float k = ctx.Attr<float>("k");
......
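Reviewer note: the compensation in the LRN hunk can be stated as a formula. Writing W(i) for the normalization window of size n around channel i, the two conventions are (a sketch following the caffe tutorial linked in the comment):

\text{Paddle:} \quad b_i = a_i \Big(k + \alpha \sum_{j \in W(i)} a_j^2\Big)^{-\beta}

\text{caffe/MKL-DNN:} \quad b_i = a_i \Big(k + \tfrac{\alpha'}{n} \sum_{j \in W(i)} a_j^2\Big)^{-\beta}

Passing \alpha' = \alpha \cdot n to MKL-DNN therefore makes the two denominators identical, which is exactly what the forward and grad kernels now do.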
...@@ -54,6 +54,7 @@ math_library(sequence_padding) ...@@ -54,6 +54,7 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function)
math_library(beam_search DEPS math_function)
math_library(matrix_bit_code) math_library(matrix_bit_code)
...@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) ...@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
if(WITH_GPU) if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <algorithm>
#include <map>
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class BeamSearchFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext &context,
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores,
const framework::LoDTensor *ids,
const framework::LoDTensor *scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto &high_level = abs_lod[level];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level,
beam_size, end_id, is_accumulated);
auto selected_items = ToMap(items, high_level.back());
if (FLAGS_v == 3) {
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset: " << i;
for (auto &item : selected_items[i]) {
VLOG(3) << item.ToString();
}
}
}
PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
auto *selected_ids_data =
selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *selected_scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
selected_ids_data[low_offset] = item.id;
selected_scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
/*
* The basic item used for sorting.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// prefix id in the lower lod level.
// size_t prefix;
// the candidate id
size_t id;
// the corresponding score
float score;
inline bool operator<(const Item &in) const {
return (score < in.score) ||
((score == in.score) && (offset < in.offset));
}
inline void operator=(const Item &in) {
offset = in.offset;
id = in.id;
score = in.score;
}
std::string ToString() {
std::ostringstream os;
os << "{";
os << "offset: " << offset << ", ";
os << "id: " << id << ", ";
os << "score: " << score << "";
os << "}";
return os.str();
}
};
protected:
/*
* Prune the source sentences whose branches are all finished; this step is optional.
* Pruning must happen one step later than finishing (thus pre_ids is needed here),
* since the end tokens must be written out.
*/
void PruneEndBeams(const framework::LoDTensor *pre_ids,
const framework::LoD &abs_lod,
std::vector<std::vector<Item>> *items, size_t lod_level,
int end_id) {
auto *pre_ids_data = pre_ids->data<int64_t>();
auto &high_level = abs_lod[lod_level];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id) ||
pre_ids_data[offset] != end_id) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) { // all branches of this beam (source sentence) have
// ended, so prune the beam
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++)
items->at(offset).clear();
}
}
}
/*
* Transform the items into a map-like structure: the index is the offset
* and the value is the list of items at that offset. NOTE: low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
size_t beam_size) {
std::vector<Item> &top_beam = *top_beam_ptr;
size_t num_beams = top_beam.size();
if (num_beams < beam_size) {
top_beam.resize(num_beams + 1);
num_beams++;
} else {
if (item < top_beam[beam_size - 1]) {
return;
}
}
for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
if (top_beam[k] < item) {
top_beam[k + 1] = top_beam[k];
} else {
top_beam[k + 1] = item;
return;
}
}
top_beam[0] = item;
}
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
int end_id, bool is_accumulated) {
std::vector<std::vector<Item>> result;
// find the current candidates
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto *pre_ids_data = pre_ids->data<int64_t>();
auto *pre_scores_data = pre_scores->data<float>();
auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
auto *scores_data = scores->data<float>();
size_t num_seqs = scores->NumElements(lod_level);
size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i];
}
for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
size_t seq_offset_start = abs_lod[lod_level][seq_id];
size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
std::vector<Item> top_beam;
top_beam.reserve(beam_size);
for (size_t offset = seq_offset_start; offset < seq_offset_end;
++offset) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id) {
// Allocate all probability mass to end_id for finished branches;
// the other candidate ids can be ignored.
Item item(offset, end_id, pre_score);
Insert(&top_beam, item, beam_size);
} else {
size_t index = offset * seq_width;
for (size_t d = 0; d < seq_width; d++, index++) {
int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
float score = is_accumulated
? scores_data[index]
: pre_score + std::log(scores_data[index]);
Item item(offset, id, score);
Insert(&top_beam, item, beam_size);
}
}
}
result.emplace_back(top_beam);
}
if (FLAGS_v == 3) {
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << item.ToString();
}
}
}
return result;
}
};
template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
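// Editor's note: an illustrative, self-contained sketch of the insertion
// used by BeamSearchFunctor::Insert above. It keeps a beam sorted in
// descending score order by shifting smaller items one slot to the right.
// The names MiniItem and InsertSorted are hypothetical and not part of the
// Paddle sources; ties are handled more simply than in the real code.
#include <vector>
struct MiniItem {
int id;
float score;
};
// Keep `beam` sorted by descending score, capped at beam_size entries.
void InsertSorted(std::vector<MiniItem> *beam, const MiniItem &item,
size_t beam_size) {
if (beam->size() < beam_size) {
beam->push_back(item);  // grow the beam until it is full
} else if (item.score <= beam->back().score) {
return;  // not better than the current worst candidate
}
int k = static_cast<int>(beam->size()) - 2;
for (; k >= 0 && (*beam)[k].score < item.score; --k) {
(*beam)[k + 1] = (*beam)[k];  // shift the smaller item right
}
(*beam)[k + 1] = item;
}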
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/cuda_device_function.h"
namespace paddle {
namespace operators {
namespace math {
struct Triple {
__device__ __forceinline__ Triple() {}
__device__ __forceinline__ Triple(int o, int i, float s)
: offset(o), id(i), score(s) {}
__device__ __forceinline__ void set(int o, int i, float s) {
offset = o;
id = i;
score = s;
}
__device__ __forceinline__ void operator=(const Triple& in) {
offset = in.offset;
id = in.id;
score = in.score;
}
__device__ __forceinline__ bool operator<(const float s) const {
return score < s;
}
__device__ __forceinline__ bool operator<(const Triple& in) const {
return (score < in.score) || ((score == in.score) && (offset < in.offset));
}
int offset;
int id;
float score;
};
__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p,
int beam_size) {
if (p < top_beam[beam_size - 1]) {
return;
}
for (int k = beam_size - 2; k >= 0; --k) {
if (top_beam[k] < p) {
top_beam[k + 1] = top_beam[k];
} else {
top_beam[k + 1] = p;
return;
}
}
top_beam[0] = p;
}
template <int MaxThreadsPerSeq, bool IsAccumulated = true>
__device__ __forceinline__ int SelectTopBeam(
Triple* top_beam, const int64_t* pre_ids, const float* pre_scores,
const int64_t* ids, const float* scores, const int seq_offset_start,
const int seq_offset_end, const int seq_width, int beam_size, int end_id,
int used_threads) {
// top_beam is shared memory
const int tid = threadIdx.x;
const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq;
int num_used_threads = used_threads;
Triple* top_beam_local = top_beam + tid * beam_size;
if (tid_of_seq < num_used_threads) {
for (int i = 0; i < beam_size; ++i) {
top_beam_local[i].set(-1, -1, -INFINITY);
}
for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) {
int pre_id = static_cast<int>(pre_ids[offset]);
if (pre_id == end_id) {
if (tid_of_seq == 0) {
Triple tmp(offset, end_id, pre_scores[offset]);
Insert(top_beam_local, tmp, beam_size);
}
} else {
int index = offset * seq_width + tid_of_seq;
if (!IsAccumulated) {
float pre_score = pre_scores[offset];
for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
float score = pre_score + __logf(scores[index]);
int id = ids ? static_cast<int>(ids[index]) : i;
Triple tmp(offset, id, score);
Insert(top_beam_local, tmp, beam_size);
index += num_used_threads;
}
} else {
for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
int id = ids ? static_cast<int>(ids[index]) : i;
float score = scores[index];
Triple tmp(offset, id, score);
Insert(top_beam_local, tmp, beam_size);
index += num_used_threads;
}
}
}
}
}
while (num_used_threads > 1) {
if (num_used_threads > 16) {
__syncthreads();
}
num_used_threads = num_used_threads >> 1;
if (tid_of_seq < num_used_threads) {
int index_in_sh = (num_used_threads + tid) * beam_size;
for (int i = 0; i < beam_size; i++) {
Insert(top_beam_local, top_beam[index_in_sh], beam_size);
index_in_sh++;
}
}
}
if (tid_of_seq == 0) {
int num_items = 0;
for (int i = 0; i < beam_size; ++i) {
num_items =
(top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items;
}
return num_items;
}
return 0;
}
__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
const int64_t* pre_ids,
const int end_id, int num_items) {
bool finish_flag = true;
for (int i = 0; i < num_items; ++i) {
int offset = top_beam_local[i].offset;
if (top_beam_local[i].id != end_id ||
static_cast<int>(pre_ids[offset]) != end_id) {
finish_flag = false;
break;
}
}
return finish_flag;
}
__device__ __forceinline__ void WriteBack(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
Triple* top_beam_local, const int seq_offset_start,
const int seq_offset_end, const int selected_seq_start,
const int selected_seq_length) {
// Only one thread per sequence reaches here (see BeamSearchDetails).
int global_index = selected_seq_start;
for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
++global_offset) {
for (int local_index = 0; local_index < selected_seq_length;
++local_index) {
if (top_beam_local[local_index].offset == global_offset) {
selected_ids[global_index] =
static_cast<int64_t>(top_beam_local[local_index].id);
selected_scores[global_index] = top_beam_local[local_index].score;
global_index++;
}
}
selected_offsets[global_offset + 1] = static_cast<size_t>(global_index);
}
}
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__device__ void BeamSearchDetails(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_offset_start, const int seq_offset_end,
const int seq_width, int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
__shared__ Triple top_beam[MaxLength];
int num_items = 0;
if (is_accumulated) {
num_items = SelectTopBeam<MaxThreadsPerSeq, true>(
top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
} else {
num_items = SelectTopBeam<MaxThreadsPerSeq, false>(
top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
}
const int tid = threadIdx.x; // use 1 thread only for each sequence
const int tid_of_seq = tid % MaxThreadsPerSeq;
if (tid_of_seq == 0) {
// Use 1 thread for each sequence.
Triple* top_beam_local = top_beam + tid * beam_size;
bool finish_flag =
PruneEndBeams(top_beam_local, pre_ids, end_id, num_items);
int selected_seq_start = 0;
int selected_seq_length = finish_flag ? 0 : num_items;
if (MaxSeqs > 1) {
const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
__shared__ int shared_mem[MaxSeqs];
// shared_mem[0 .. MaxSeqs - 1] holds the selected length of each sequence
shared_mem[seq_id] = selected_seq_length;
__syncthreads();
for (int s = 0; s < seq_id; ++s) {
selected_seq_start += shared_mem[s];
}
if (seq_id == 0) {
selected_offsets[0] = 0;
}
} else {
selected_offsets[0] = 0;
}
WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
seq_offset_start, seq_offset_end, selected_seq_start,
selected_seq_length);
}
}
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
size_t* selected_offsets,
const int64_t* pre_ids,
const float* pre_scores, const int64_t* ids,
const float* scores, const size_t* seq_offsets,
const int num_seqs, const int seq_width,
int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
const int tid = threadIdx.x;
const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
int seq_offset_start = static_cast<int>(seq_offsets[seq_id]);
int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
}
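// Editorial note: one block handles up to MaxSeqs sequences. Threads are
// partitioned into groups of MaxThreadsPerSeq, and seq_id = tid /
// MaxThreadsPerSeq selects which sequence a thread works on (see the kernel
// above); within a group, tid_of_seq = tid % MaxThreadsPerSeq.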
template <int MaxLength, int MaxThreadsPerSeq>
__global__ void BeamSearchKernelSingle(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_length, const int seq_width,
int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
const int seq_offset_start = 0;
const int seq_offset_end = seq_length;
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
}
static inline int GetNumUsedThreads(const int max_threads_per_seq,
const int seq_width, int beam_size) {
int num_used_threads = (seq_width + beam_size - 1) / beam_size;
num_used_threads = max_threads_per_seq < num_used_threads
? max_threads_per_seq
: num_used_threads;
num_used_threads =
num_used_threads > 32
? (num_used_threads >> 5) << 5
: (num_used_threads > 16
? 32
: (num_used_threads > 8
? 16
: (num_used_threads > 4
? 8
: (num_used_threads > 2 ? 4
: num_used_threads))));
return num_used_threads;
}
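// Worked example (illustrative): with seq_width = 100 and beam_size = 4,
// ceil(100 / 4) = 25 candidate threads are requested; 25 lies in (16, 32],
// so it is rounded up to 32. With seq_width = 1000 and beam_size = 4, the
// request is 250, which is first capped by max_threads_per_seq and, if
// still above 32, rounded down to a multiple of 32 (250 -> 224).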
template <typename T>
class BeamSearchFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores,
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
const int64_t* pre_ids_data = pre_ids->data<int64_t>();
const float* pre_scores_data = pre_scores->data<float>();
const int64_t* ids_data = ids ? ids->data<int64_t>() : nullptr;
const float* scores_data = scores->data<float>();
const size_t num_seqs = abs_lod[level].size() - 1;
size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i];
}
// Reserve a big enough memory.
auto selected_dims =
framework::make_ddim({static_cast<int64_t>(num_seqs * beam_size), 1});
int64_t* selected_ids_data =
selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
float* selected_scores_data =
selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
framework::LoD selected_lod(2);
selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
selected_lod[1].resize(scores->dims()[0] + 1);
size_t* selected_offsets =
selected_lod[1].CUDAMutableData(context.GetPlace());
if (num_seqs == 1) {
const int seq_length = static_cast<int>(abs_lod[level][1]);
const int kMaxThreadsPerSeq = 1024;
int num_used_threads =
GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
static_cast<int>(beam_size));
switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
1, kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_length, static_cast<int>(seq_width),
static_cast<int>(beam_size), static_cast<int>(end_id),
is_accumulated, num_used_threads));
}
} else if (num_seqs <= 4) {
const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace());
// Use only 1 block
const int kMaxThreadsPerSeq = 32;
const int kMaxSeqs = 4;
int num_used_threads =
GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
static_cast<int>(beam_size));
switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_offsets, static_cast<int>(num_seqs),
static_cast<int>(seq_width), static_cast<int>(beam_size),
end_id, is_accumulated, num_used_threads));
}
} else {
LOG(FATAL) << "Not implemented.";
}
context.Wait();
if (!framework::CheckLoD(selected_lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
}
selected_ids->set_lod(selected_lod);
selected_scores->set_lod(selected_lod);
if (selected_lod[1].back() < num_seqs * beam_size) {
auto final_selected_dims = framework::make_ddim(
{static_cast<int64_t>(selected_lod[1].back()), 1});
selected_ids->Resize(final_selected_dims);
selected_scores->Resize(final_selected_dims);
}
}
};
template class BeamSearchFunctor<platform::CUDADeviceContext, int>;
template class BeamSearchFunctor<platform::CUDADeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CUDADeviceContext, float>;
template class BeamSearchFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
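// Editor's note: the while (num_used_threads > 1) loop in SelectTopBeam is
// a tree reduction. In each round the upper half of the active threads has
// its per-thread beam merged into the lower half, until thread 0 of the
// sequence holds the top beam_size items overall. An illustrative trace
// with 8 active threads, each holding a sorted local beam:
//   round 1: thread t (t < 4) merges beam[t + 4] into beam[t]
//   round 2: thread t (t < 2) merges beam[t + 2] into beam[t]
//   round 3: thread 0 merges beam[1] into beam[0]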
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
/*
* This is an implementation of beam search.
*
* To explain the details, let's take the machine translation task as an
* example. In this task one source sentence is translated into multiple
* target sentences, so at any point a source sentence owns several
* translation prefixes (target sentences that have not yet ended), and at
* each time step every prefix proposes some candidates. Given the candidate
* ids and their corresponding scores (probabilities), this functor sorts the
* candidates, selects the top beam_size of them for each source sentence,
* and stores the selected candidates' ids and scores into LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* - LoD (should have 2 levels)
* - first level: [0, 1, 4]
* - second level: [0, 1, 2, 3, 4]
* - tensor's data:
* [[4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]]
*
* scores:
* - LoD same as `ids`
* - tensor's data
* [[0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]]
*
* The inputs mean that there are 2 source sentences to translate; the first
* source has 1 prefix and the second source has 3 prefixes.
*
* Let's assume the beam size is 2; then the beam search's output would be
* - LoD
* - first level: [0, 1, 2]
* - second level: [0, 2, 4]
* - id tensor's data
* [[4,
* 1,
* 3,
* 8]]
* - score tensor's data
* [[0.5,
* 0.3,
* 0.9,
* 0.7]]
*
* TODO: all the prune operations should live inside the beam search, so it
* would be better to split the beam search algorithm into a sequence of
* smaller operators, with the prune operators inserted into that sequence.
*/
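/*
* Scoring note (editorial): when is_accumulated is false, the incoming
* scores are step-wise probabilities, so a candidate's running score is
* updated in log space:
*
*   score(prefix + id) = score(prefix) + log(P(id | prefix))
*
* When is_accumulated is true, the scores are already accumulated and are
* used as-is. Finished prefixes (pre_id == end_id) keep their previous
* score and only propose end_id again.
*/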
template <typename DeviceContext, typename T>
class BeamSearchFunctor {
public:
/*
* The main function of beam search.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might hold the candidate term id sets,
* each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1],
* [0 1 2]]
* - [[]
* [0 1]]
* the first level of LoD tells that there are two source sentences. The
* second level describes the details of the candidate id set's offsets in
* the source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
* If all the input tensors are empty (in the machine translation task this
* means no candidate is provided), the decoding task stops running.
*/
void operator()(const DeviceContext& context,
const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores,
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include <vector>
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
paddle::framework::LoDTensor* pre_ids,
paddle::framework::LoDTensor* pre_scores) {
// lod
paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
scores->set_lod(lod);
auto dims = paddle::framework::make_ddim({4, 3});
ids->Resize(dims);
scores->Resize(dims);
paddle::platform::CPUPlace place;
auto* ids_data = ids->mutable_data<int64_t>(place);
auto* scores_data = scores->mutable_data<float>(place);
std::vector<int64_t> ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
std::vector<float> scores_vec_data(
{0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
CHECK_EQ(static_cast<size_t>(ids->numel()), ids_vec_data.size());
CHECK_EQ(static_cast<size_t>(ids->numel()), scores_vec_data.size());
for (int64_t i = 0; i < ids->numel(); i++) {
ids_data[i] = ids_vec_data[i];
scores_data[i] = scores_vec_data[i];
}
// pre_ids
pre_ids->Resize(paddle::framework::make_ddim({4, 1}));
for (int i = 0; i < 4; i++) {
pre_ids->mutable_data<int64_t>(place)[i] = i + 1;
}
// pre_scores
pre_scores->Resize(paddle::framework::make_ddim({4, 1}));
for (int i = 0; i < 4; i++) {
pre_scores->mutable_data<float>(place)[i] = 0.1f * (i + 1);
}
}
template <typename DeviceContext, typename Place>
void TestBeamSearch() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
TensorCopySync(cpu_ids, *place, &ids);
TensorCopySync(cpu_scores, *place, &scores);
TensorCopySync(cpu_pre_ids, *place, &pre_ids);
TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
TensorCopySync(selected_ids, paddle::platform::CPUPlace(),
&cpu_selected_ids);
TensorCopySync(selected_scores, paddle::platform::CPUPlace(),
&cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
}
#ifdef PADDLE_WITH_CUDA
TEST(BeamSearch, GPU) {
TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
}
#endif
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include <glog/logging.h>
#include <iostream> #include <iostream>
#include <queue> #include <queue>
#include <utility> #include <utility>
...@@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { ...@@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const {
auto index = (*int_dist_)(*random_engine_); auto index = (*int_dist_)(*random_engine_);
auto p = (*real_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_);
if (p > alias_probs_[index]) { if (p > alias_probs_[index]) {
return alias_[index]; int alias = alias_[index];
if (alias == exceptional_val) {
LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val;
return index;
}
return alias;
} else { } else {
return index; return index;
} }
......
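// Editor's note: the hunk above guards CustomSampler::Sample() against an
// uninitialized alias slot (-1). For context, a minimal, self-contained
// sketch of the alias method's sampling step under the same table layout
// (hypothetical names AliasSample etc.; not the Paddle implementation):
#include <random>
#include <vector>
// Draw from a discrete distribution given precomputed alias tables.
// alias_probs[i] decides between column i itself and its fallback;
// alias[i] is the fallback id, or -1 when column i needs no alias.
int AliasSample(const std::vector<float>& alias_probs,
const std::vector<int>& alias, std::mt19937* rng) {
std::uniform_int_distribution<int> pick_col(
0, static_cast<int>(alias_probs.size()) - 1);
std::uniform_real_distribution<float> unif(0.0f, 1.0f);
int index = pick_col(*rng);
float p = unif(*rng);
if (p > alias_probs[index]) {
int a = alias[index];
return (a == -1) ? index : a;  // guard: fall back to the column itself
}
return index;
}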
...@@ -116,6 +116,7 @@ class CustomSampler : public Sampler { ...@@ -116,6 +116,7 @@ class CustomSampler : public Sampler {
const float* alias_probs_; const float* alias_probs_;
const int* alias_; const int* alias_;
const float* probs_; const float* probs_;
const int exceptional_val = -1;
std::shared_ptr<std::mt19937> random_engine_; std::shared_ptr<std::mt19937> random_engine_;
std::shared_ptr<std::uniform_real_distribution<>> real_dist_; std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
std::shared_ptr<std::uniform_int_distribution<>> int_dist_; std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
......
...@@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { ...@@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
auto* out_data = output->value().data<float>(); auto* out_data = output->value().data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) { for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) { for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
} }
} }
......
...@@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { ...@@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) {
auto* out_data = output_cpu.data<float>(); auto* out_data = output_cpu.data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) { for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) { for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
} }
} }
......
...@@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
cpu_in_grad.set_lod(in_grad.lod()); cpu_in_grad.set_lod(in_grad.lod());
} }
EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
EXPECT_EQ(in_grad.lod(), lod); EXPECT_EQ(in_grad.lod(), lod);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
......
...@@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel<T> {
PrepareSamples<DeviceContext, T>(context, sampler); PrepareSamples<DeviceContext, T>(context, sampler);
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
const int64_t *sample_labels_data = sample_labels->data<int64_t>(); const int64_t *sample_labels_data = sample_labels->data<int64_t>();
for (int x = 0; x < sample_labels->numel(); x++) {
PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x);
}
auto sample_out = context.Output<Tensor>("SampleLogits"); auto sample_out = context.Output<Tensor>("SampleLogits");
T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace()); T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
......
if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
endif()
...@@ -17,39 +17,39 @@ limitations under the License. */ ...@@ -17,39 +17,39 @@ limitations under the License. */
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace operators {
namespace NG_OPS = paddle::operators::ngraphs; namespace NG_OPS = paddle::operators::ngraphs;
std::map<std::string, std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", NG_OPS::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode}, {"mean", NG_OPS::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mean_grad", NG_OPS::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", NG_OPS::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", NG_OPS::BuildMulGradNode},
{"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, {"softmax", NG_OPS::BuildSoftmaxNode},
{"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode}, {"scale", NG_OPS::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", NG_OPS::BuildTopKNode}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) { void NgraphBridge::BuildNgNode(
const std::shared_ptr<framework::OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map_); NG_NODE_MAP[op_type](op, ngb_node_map_);
} }
} // namespace framework } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -21,16 +21,16 @@ limitations under the License. */ ...@@ -21,16 +21,16 @@ limitations under the License. */
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
namespace paddle { #include "paddle/fluid/framework/operator.h"
namespace framework {
class OperatorBase; namespace paddle {
namespace operators {
class NgraphBridge { class NgraphBridge {
public: public:
static std::map< static std::map<
std::string, std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP; NG_NODE_MAP;
...@@ -41,7 +41,7 @@ class NgraphBridge { ...@@ -41,7 +41,7 @@ class NgraphBridge {
var_node_map) var_node_map)
: ngb_node_map_(var_node_map) {} : ngb_node_map_(var_node_map) {}
void BuildNgNode(const std::shared_ptr<OperatorBase>& op); void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
private: private:
std::shared_ptr< std::shared_ptr<
...@@ -49,5 +49,5 @@ class NgraphBridge { ...@@ -49,5 +49,5 @@ class NgraphBridge {
ngb_node_map_; ngb_node_map_;
}; };
} // namespace framework } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -16,22 +16,25 @@ limitations under the License. */ ...@@ -16,22 +16,25 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
namespace paddle { namespace paddle {
namespace framework { namespace operators {
static ngraph::Shape Ddim2Shape(const DDim& dims) { static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
ngraph::Shape sp; ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) { for (int i = 0; i < dims.size(); ++i) {
int k = dims[i]; int k = dims[i];
...@@ -41,117 +44,39 @@ static ngraph::Shape Ddim2Shape(const DDim& dims) { ...@@ -41,117 +44,39 @@ static ngraph::Shape Ddim2Shape(const DDim& dims) {
return sp; return sp;
} }
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = { static std::map<framework::proto::VarType::Type, ngraph::element::Type>
{proto::VarType::FP32, ngraph::element::f32}, pd2ng_type_map = {
{proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT64, ngraph::element::i64}, {framework::proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::BOOL, ngraph::element::boolean}, {framework::proto::VarType::INT64, ngraph::element::i64},
{framework::proto::VarType::BOOL, ngraph::element::boolean},
}; };
typedef enum { /* nGraph support state on ops */ std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
FULL_TRAIN, /* Support full ops for train */ NgraphEngine::func_cache_ = {};
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphEngine {
public:
explicit NgraphEngine(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const; std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU");
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>> static std::vector<std::vector<int>> NgraphOpIntervals(
NgraphOperator::NgraphOpIntervals( framework::BlockDesc* block) {
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) { std::vector<std::vector<int>> intervals;
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>> auto ops = block->AllOps();
intervals; int size = ops.size();
if (ops->empty()) { int left = 0;
return intervals; while (left < size && ops.at(left)->Type() != framework::kFeedOpType) {
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left; ++left;
} }
if (left == size) { if (left == size) {
return intervals; return intervals;
} }
while (left < size && ops->at(left)->Type() == kFeedOpType) { while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
++left; ++left;
} }
size_t right = left; int right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) { while (right < size && ops.at(right)->Type() != framework::kFetchOpType) {
++right; ++right;
} }
if (right == size) { if (right == size) {
...@@ -160,66 +85,89 @@ NgraphOperator::NgraphOpIntervals( ...@@ -160,66 +85,89 @@ NgraphOperator::NgraphOpIntervals(
if (left >= right) return intervals; if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch // (left, right - 1) represents indices between feed and fetch
size_t pivot = left; int pivot = left;
while (pivot < right) { while (pivot < right) {
auto op_type = ops->at(pivot)->Type(); auto op_type = ops.at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { NgraphBridge::NG_NODE_MAP.end()) {
++pivot; ++pivot;
} else { } else {
size_t start = pivot, end = start; int start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find( (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
ops->at(pivot)->Type()) != NgraphBridge::NG_NODE_MAP.end())) {
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot; ++pivot;
++end; ++end;
} }
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator> std::vector<int> interval = {start, end};
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval); intervals.push_back(interval);
} }
} // end while } // end while
return intervals; return intervals;
} }
NgraphOperator::NgraphOperator( static void SubstituteNgraphOp(framework::BlockDesc* block,
const ProgramDesc& prog, size_t block_id, std::string block_str,
std::vector<std::unique_ptr<OperatorBase>>::iterator start, std::vector<int> interval) {
std::vector<std::unique_ptr<OperatorBase>>::iterator end, framework::ProgramDesc program;
const std::string& type, const VariableNameMap& inputs, block->RemoveOp(interval.at(0), interval.at(1));
const VariableNameMap& outputs, const AttributeMap& attrs) auto* ng_op = block->InsertOp(interval.at(0));
: OperatorBase(type, inputs, outputs, attrs), ng_op->SetType("ngraph_engine");
pdesc_(prog), ng_op->SetAttr("interval", interval);
block_(block_id) { ng_op->SetAttr("graph", block_str);
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start; }
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end; // TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089
(*it)->Type() != kFetchOpType; ++it) { void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) {
for (auto& var_name_item : (*it)->Inputs()) { #ifdef PADDLE_WITH_NGRAPH
for (auto& var_name : var_name_item.second) { VLOG(4) << "use_ngraph=True";
post_op_inputs_.insert(var_name); for (size_t bid = 0; bid < program.Size(); ++bid) {
} // TODO(baojun-nervana): Remove the const_cast
auto* block =
const_cast<framework::ProgramDesc&>(program).MutableBlock(bid);
std::string block_str = block->Proto()->SerializeAsString();
auto intervals = NgraphOpIntervals(block);
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
SubstituteNgraphOp(block, block_str, *it);
} }
} }
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { NgraphEngine::NgraphEngine(const framework::Scope& scope,
is_full_ = true; const platform::Place& place,
} const std::string& serialized_graph,
const std::vector<int>& interval)
: scope_(scope), place_(place) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
Process(); func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) +
serialized_graph;
framework::proto::BlockDesc bdesc;
bdesc.ParseFromString(serialized_graph);
framework::BlockDesc block(nullptr, &bdesc);
Prepare(block, interval);
BuildNgIO();
GetNgFunction();
} }
void NgraphOperator::Process() { void NgraphEngine::Prepare(const framework::BlockDesc& block,
auto& bdesc = pdesc_.Block(block_); const std::vector<int>& interval) {
for (auto& var : bdesc.AllVars()) { for (auto& var : block.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS || if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
continue; continue;
} }
...@@ -228,7 +176,8 @@ void NgraphOperator::Process() { ...@@ -228,7 +176,8 @@ void NgraphOperator::Process() {
continue; continue;
} }
if (var_name != "fetch" && var_name != "feed") { if (var_name != framework::kFeedOpType &&
var_name != framework::kFetchOpType) {
auto pd_type = var->GetDataType(); auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
...@@ -242,53 +191,66 @@ void NgraphOperator::Process() { ...@@ -242,53 +191,66 @@ void NgraphOperator::Process() {
} }
} }
for (auto* op : bdesc.AllOps()) { auto ops_desc = block.AllOps();
if (op->Type() == kFetchOpType) { int idx = interval[0];
std::string fetch_target_name = op->Input("X")[0]; while (idx < interval[1]) {
fetches_.insert(fetch_target_name); auto op_desc = ops_desc.at(idx);
} auto op = framework::OpRegistry::CreateOp(*op_desc);
fused_ops_.push_back(std::move(op));
++idx;
} }
}
void NgraphOperator::RunImpl(const Scope& scope, while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
const platform::Place& place) const { auto op_desc = ops_desc.at(idx);
op_state ng_op_state = PARTIAL_TEST; for (auto& var_name_item : op_desc->Inputs()) {
auto& bdesc = pdesc_.Block(block_); for (auto& var_name : var_name_item.second) {
for (auto* op : bdesc.AllOps()) { post_op_inputs_.insert(var_name);
if (op->Type().find("_grad") != std::string::npos) { }
ng_op_state = PARTIAL_TRAIN;
break;
} }
++idx;
} }
if (is_full_) { while (idx < static_cast<int>(ops_desc.size()) &&
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; ops_desc.at(idx)->Type() == framework::kFetchOpType) {
std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
fetches_.insert(fetch_target_name);
++idx;
} }
NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
persistables_, fetches_, post_op_inputs_, ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
ng_op_state); ng_op_state_ = OpState::FULL;
ngraph_engine.Run(scope, place); }
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> for (auto* op_desc : ops_desc) {
NgraphEngine::func_cache_ = {}; if (op_desc->Type().find("_grad") != std::string::npos) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN
: OpState::PARTIAL_TRAIN;
break;
}
}
std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ = if (ng_op_state_ != OpState::FULL_TRAIN &&
ngraph::runtime::Backend::create("CPU"); ng_op_state_ != OpState::PARTIAL_TRAIN) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST
: OpState::PARTIAL_TEST;
}
}
void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) { void NgraphEngine::GetNgInputShape(
RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); std::shared_ptr<framework::OperatorBase> op) {
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx); op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) { for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name); auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) { if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims()); auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) != if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) { var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) { if (var_node_map_->find(var_name) == var_node_map_->end()) {
// auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
auto ng_type = var_type_map_.at(var_name); auto ng_type = var_type_map_.at(var_name);
auto prm = auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true); std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
...@@ -302,22 +264,25 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) { ...@@ -302,22 +264,25 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
} }
void NgraphEngine::BuildNgNodes() { void NgraphEngine::BuildNgNodes() {
for (auto& var_name : var_out_) { for (auto& op : fused_ops_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) { for (auto& var_name_item : op->Outputs()) {
auto* var = scope_.FindVar(var_name); for (auto& var_name : var_name_item.second) {
if (var && var->IsType<LoDTensor>()) { if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto* var = scope_.FindVar(var_name);
auto& ddim = tensor_pd->dims(); if (var && var->IsType<framework::LoDTensor>()) {
auto ng_shape = Ddim2Shape(ddim); auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto ng_type = var_type_map_.at(var_name); auto& ddim = tensor_pd->dims();
auto prm = auto ng_shape = Ddim2Shape(ddim);
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true); auto ng_type = var_type_map_.at(var_name);
(*var_node_map_)[var_name] = prm; auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
} }
} }
} }
NgraphBridge ngb(var_node_map_);
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) { for (auto& op : fused_ops_) {
ngb.BuildNgNode(op); ngb.BuildNgNode(op);
} }
...@@ -363,25 +328,25 @@ void NgraphEngine::BuildNgIO() { ...@@ -363,25 +328,25 @@ void NgraphEngine::BuildNgIO() {
op->Type()); op->Type());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) { switch (ng_op_state_) {
case PARTIAL_TEST: case OpState::PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) { fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case FULL_TEST: case OpState::FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) { if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case PARTIAL_TRAIN: case OpState::PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case FULL_TRAIN: case OpState::FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
...@@ -416,50 +381,30 @@ void NgraphEngine::BuildNgFunction() { ...@@ -416,50 +381,30 @@ void NgraphEngine::BuildNgFunction() {
std::make_shared<ngraph::Function>(func_outputs, func_inputs); std::make_shared<ngraph::Function>(func_outputs, func_inputs);
} }
std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphEngine::GetNgFunction() { void NgraphEngine::GetNgFunction() {
bool cache_on = true; bool cache_on = true;
if (cache_on) { if (cache_on) {
std::string cache_key_val = *GetCacheKey(); std::string input_shape_str;
if (func_cache_.find(cache_key_val) != func_cache_.end()) { for (auto& var_name : var_in_) {
ngraph_function_ = func_cache_.at(cache_key_val); auto shape = var_node_map_->at(var_name)->get_shape();
for (size_t i = 0; i < shape.size(); ++i) {
input_shape_str += std::to_string(shape.at(i));
}
}
    func_cache_key_ = input_shape_str + func_cache_key_;
    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
      ngraph_function_ = func_cache_.at(func_cache_key_);
    } else {
      BuildNgFunction();
      func_cache_[func_cache_key_] = ngraph_function_;
    }
  } else {
    BuildNgFunction();
  }
}
void NgraphEngine::Run(const framework::Scope& scope,
                       const platform::Place& place) const {
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
...@@ -468,37 +413,35 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
    auto sp = var_node_map_->at(vi)->get_shape();
    std::shared_ptr<ngraph::runtime::Tensor> ti;
    auto* var = scope.FindVar(vi);
    if (var && var->IsType<framework::LoDTensor>()) {
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                     "Ensure ngraph tensor layout aligns with the paddle tensor");
      auto ng_type = var_type_map_.at(vi);
      if (ng_type == ngraph::element::f32) {
        auto pd_arr = tensor_pd->mutable_data<float>(place);
        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
      } else if (ng_type == ngraph::element::i32) {
        const int* arr = tensor_pd->data<int>();
        ti = backend_->create_tensor(ngraph::element::i32, sp,
                                     const_cast<int*>(arr));
      } else if (ng_type == ngraph::element::i64) {
        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
      } else if (ng_type == ngraph::element::f64) {
        auto pd_arr = tensor_pd->mutable_data<double>(place);
        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
      } else if (ng_type == ngraph::element::boolean) {
        auto pd_arr = tensor_pd->mutable_data<bool>(place);
        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
      } else {
        PADDLE_THROW("Data type not handled for var %s", vi);
      }
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
    }
    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
                    ng_op_state_ == OpState::FULL_TEST);
    bool is_persistable =
...@@ -510,36 +453,39 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
  }
  for (size_t i = 0; i < var_out_.size(); ++i) {
    auto vo = var_out_[i];
    auto* var = scope.FindVar(vo);
    std::shared_ptr<ngraph::runtime::Tensor> to;
    if (var && var->IsType<framework::LoDTensor>()) {
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
      auto dd = tensor_pd->dims();
      ngraph::Shape sp = Ddim2Shape(dd);
      auto ng_type = var_type_map_.at(vo);
      if (ng_type == ngraph::element::f32) {
        auto pd_arr = tensor_pd->mutable_data<float>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::i64) {
        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::i32) {
        auto pd_arr = tensor_pd->mutable_data<int>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::f64) {
        auto pd_arr = tensor_pd->mutable_data<double>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::boolean) {
        auto pd_arr = tensor_pd->mutable_data<bool>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else {
        PADDLE_THROW("Data type not handled for var %s", vo);
      }
      t_out.push_back(to);
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
    }
  }
  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::Run
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace operators {
enum class OpState { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST, /* Support partial list of ops for test */
FULL, /* All ops supported from feed to fetch */
UNKNOWN /* Output all for debug purpose */
};
// perform graph build through bridge and execute computation
class NgraphEngine {
public:
explicit NgraphEngine(const framework::Scope& scope,
const platform::Place& place,
const std::string& serialized_graph,
const std::vector<int>& interval);
void Run(const framework::Scope& scope, const platform::Place& place) const;
static void EnableNgraph(const framework::ProgramDesc& program);
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache_;
const framework::Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
OpState ng_op_state_ = OpState::UNKNOWN;
std::string func_cache_key_;
// ngraph backend, e.g. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// prepare info for ngraph engine
void Prepare(const framework::BlockDesc& block,
const std::vector<int>& interval);
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h"
namespace paddle {
namespace operators {
class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDispensable();
AddOutput("Ys", "A list of outputs").AsDispensable();
AddAttr<std::string>("graph", "the graph.");
AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
AddComment("ngraph engine operator.");
}
};
class NgraphEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker,
ops::NgraphEngineOpMaker);
REGISTER_OP_CPU_KERNEL(
ngraph_engine,
ops::NgraphEngineKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
class NgraphEngineOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, ctx.GetPlace());
return kt;
}
};
template <typename DeviceContext, typename T>
class NgraphEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
auto place = ctx.GetPlace();
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
ngraph_engine.Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
...@@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase {
    auto* queue_holder =
        queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
    auto thread_num = Attr<int>("thread_num");
    auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
    auto batch_size = Attr<int>("batch_size");
    auto file_type = Attr<std::string>("file_type");
    auto file_format = Attr<std::string>("file_format");
    auto file_list = Attr<std::vector<std::string>>("file_list");
    DataDesc data_desc(batch_size, file_list, file_type, file_format,
                       dense_slot_index, sparse_slot_index, sparse_slots);
    VLOG(1) << data_desc;
    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num,
                                           data_desc));
  }
};
...@@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
             "Name of the `LoDTensorBlockingQueueHolder` variable");
    AddAttr<int>("thread_num", "the thread num to read data");
    AddAttr<int>("batch_size", "the batch size of read data");
    AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
    AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
    AddAttr<std::vector<std::string>>("file_list",
                                      "The list of files that need to be read");
    AddAttr<std::vector<int>>(
        "dense_slot_index",
        "the dense slot ids that should be extracted from the file")
        .SetDefault({});
    AddAttr<std::vector<int>>(
        "sparse_slot_index",
        "the sparse slot ids that should be extracted from the file")
        .SetDefault({});
    AddAttr<std::vector<std::string>>("sparse_slots",
                                      "the sparse slot ids that should be "
                                      "extracted from the file, used when the "
                                      "file format is svm");
    AddComment(R"DOC(
      Create CTRReader to support reading ctr data with cpp.
...
...@@ -73,6 +73,9 @@ static inline void parse_line(
  }
}
// label slot1:fea_sign slot2:fea_sign slot1:fea_sign
static inline void parse_svm_line(const std::string& line) {}
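The comment above documents the svm line layout, but parse_svm_line is left as an empty stub in this change. A minimal sketch of what such a parser could look like, assuming the string_split helper used by parse_csv_line later in this file; the name parse_svm_line_sketch and its signature are illustrative only, not part of the change:
static inline void parse_svm_line_sketch(
    const std::string& line, int64_t* label,
    std::unordered_map<std::string, std::vector<int64_t>>* slot_to_data) {
  // Hypothetical sketch. Layout: "label slot1:fea_sign slot2:fea_sign ...",
  // tokens separated by ' ', each token after the label is slot:feasign.
  std::vector<std::string> ret;
  string_split(line, ' ', &ret);
  *label = std::stol(ret[0]);
  for (size_t i = 1; i < ret.size(); ++i) {
    size_t pos = ret[i].find(':');
    if (pos == std::string::npos) continue;  // skip malformed tokens
    const std::string slot = ret[i].substr(0, pos);
    (*slot_to_data)[slot].push_back(std::stol(ret[i].substr(pos + 1)));
  }
}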
class Reader {
 public:
  virtual ~Reader() {}
...@@ -95,11 +98,27 @@ class GzipReader : public Reader {
  igzstream gzstream_;
};
class PlainFileReader : public Reader {
 public:
  explicit PlainFileReader(const std::string& file_name)
      : stream_(file_name.c_str()) {}
  ~PlainFileReader() {}
  bool HasNext() override { return stream_.peek() != EOF; }
  void NextLine(std::string* line) override { std::getline(stream_, *line); }
 private:
  std::ifstream stream_;
};
template <typename SingleFileReader>
class MultiFileReader : public Reader {
 public:
  explicit MultiFileReader(const std::vector<std::string>& file_list) {
    for (auto& file : file_list) {
      readers_.emplace_back(std::make_shared<SingleFileReader>(file));
    }
  }
...@@ -119,46 +138,35 @@ class MultiGzipReader : public Reader {
  }
 private:
  std::vector<std::shared_ptr<SingleFileReader>> readers_;
  size_t current_reader_index_ = 0;
};
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
                   std::shared_ptr<LoDTensorBlockingQueue> queue) {
  VLOG(3) << "monitor thread in";
  bool reader_thread_is_running = true;
  while (reader_thread_is_running) {
    VLOG(3) << "reader_thread_is_running";
    reader_thread_is_running = false;
    for (size_t i = 0; i < (*thread_status).size(); ++i) {
      if ((*thread_status)[i] == Running) {
        VLOG(3) << "reader is running!";
        reader_thread_is_running = true;
      }
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  }
  VLOG(3) << "all reader threads are stopped, close the queue";
  queue->Close();
  VLOG(3) << "monitor thread exited";
}
void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
  std::unordered_map<std::string, size_t> slot_to_index;
  for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) {
    slot_to_index[data_desc.sparse_slot_ids_[i]] = i;
  }
  std::string line;
...@@ -166,21 +174,17 @@ void ReadThread(const std::vector<std::string>& file_list,
  std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
  std::vector<int64_t> batch_label;
  while (reader->HasNext()) {
    batch_data.clear();
    batch_data.reserve(data_desc.batch_size_);
    batch_label.clear();
    batch_label.reserve(data_desc.batch_size_);
    // read batch_size data
    for (int i = 0; i < data_desc.batch_size_; ++i) {
      if (reader->HasNext()) {
        reader->NextLine(&line);
        std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
        int64_t label;
        parse_line(line, slot_to_index, &label, &slot_to_data);
...@@ -193,8 +197,8 @@ void ReadThread(const std::vector<std::string>& file_list,
    std::vector<framework::LoDTensor> lod_datas;
    // first insert a tensor for each sparse slot
    for (auto& slot : data_desc.sparse_slot_ids_) {
      std::vector<size_t> lod_data{0};
      std::vector<int64_t> batch_feasign;
...@@ -226,11 +230,167 @@ void ReadThread(const std::vector<std::string>& file_list,
    lod_datas.push_back(label_tensor);
    queue->Push(lod_datas);
    VLOG(4) << "push one data, queue_size=" << queue->Size();
  }
}
// label dense_fea,dense_fea sparse_fea,sparse_fea
static inline void parse_csv_line(
const std::string& line, const DataDesc& data_desc, int64_t* label,
std::vector<std::vector<float>>* dense_datas,
std::vector<std::vector<int64_t>>* sparse_datas) {
std::vector<std::string> ret;
string_split(line, ' ', &ret);
*label = std::stol(ret[0]);
dense_datas->resize(data_desc.dense_slot_index_.size());
for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
int slot_idx = data_desc.dense_slot_index_[i];
auto& slot_data = ret[slot_idx];
std::vector<std::string> data_in_slot_str;
string_split(slot_data, ',', &data_in_slot_str);
for (auto& data_str : data_in_slot_str) {
(*dense_datas)[i].push_back(std::stof(data_str));
}
}
sparse_datas->resize(data_desc.sparse_slot_index_.size());
for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
int slot_idx = data_desc.sparse_slot_index_[i];
auto& slot_data = ret[slot_idx];
std::vector<std::string> data_in_slot_str;
string_split(slot_data, ',', &data_in_slot_str);
for (auto& data_str : data_in_slot_str) {
auto id = std::stol(data_str);
(*sparse_datas)[i].push_back(id);
}
}
}
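For concreteness, with dense_slot_index = {1} and sparse_slot_index = {2} (the configuration the csv unit test later in this change uses), a line parses as follows; this trace is illustrative only:
// Illustrative trace of parse_csv_line (not part of the change):
//   line  = "1 1.1,1.1 1,1,1,1"
//   label = 1                            // column 0
//   (*dense_datas)[0]  = {1.1f, 1.1f}    // column 1, split on ','
//   (*sparse_datas)[0] = {1, 1, 1, 1}    // column 2, split on ','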
void ReadCsvData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
std::string line;
while (reader->HasNext()) {
std::vector<int64_t> batch_label;
batch_label.reserve(data_desc.batch_size_);
std::vector<std::vector<std::vector<float>>> batch_dense_data;
batch_dense_data.reserve(data_desc.batch_size_);
std::vector<std::vector<std::vector<int64_t>>> batch_sparse_data;
batch_sparse_data.reserve(data_desc.batch_size_);
// read batch_size data
for (int i = 0; i < data_desc.batch_size_; ++i) {
if (reader->HasNext()) {
reader->NextLine(&line);
int64_t label;
std::vector<std::vector<float>> dense_datas;
std::vector<std::vector<int64_t>> sparse_datas;
parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas);
batch_label.push_back(label);
if (!batch_dense_data.empty()) {
PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(),
"dense data should have the same shape");
}
batch_dense_data.push_back(dense_datas);
batch_sparse_data.push_back(sparse_datas);
} else {
break;
}
}
// the order of output data is label, dense_datas, sparse_datas
std::vector<framework::LoDTensor> lod_datas;
// insert label tensor
framework::LoDTensor label_tensor;
auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
platform::CPUPlace());
memcpy(label_tensor_data, batch_label.data(),
batch_label.size() * sizeof(int64_t));
lod_datas.push_back(label_tensor);
// insert tensor for each dense_slots
for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
framework::LoDTensor lod_tensor;
size_t width = batch_dense_data[0][i].size();
auto* tensor_data = lod_tensor.mutable_data<float>(
framework::make_ddim(
{static_cast<int64_t>(batch_dense_data.size()), // batch_size
static_cast<int64_t>(width)}),
platform::CPUPlace());
for (size_t j = 0; j < batch_dense_data.size(); ++j) {
auto& dense_data_row = batch_dense_data[j][i];
memcpy(tensor_data + j * width, dense_data_row.data(),
width * sizeof(float));
}
lod_datas.push_back(lod_tensor);
}
// insert tensor for each sparse_slots
for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
std::vector<size_t> lod_data{0};
std::vector<int64_t> batch_feasign;
for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) {
auto& sparse_ids = batch_sparse_data[row_idx][i];
lod_data.push_back(lod_data.back() + sparse_ids.size());
batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(),
sparse_ids.end());
}
framework::LoDTensor lod_tensor;
framework::LoD lod{lod_data};
lod_tensor.set_lod(lod);
int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
platform::CPUPlace());
memcpy(tensor_data, batch_feasign.data(),
batch_feasign.size() * sizeof(int64_t));
lod_datas.push_back(lod_tensor);
}
queue->Push(lod_datas);
VLOG(4) << "push one data, queue_size=" << queue->Size();
}
}
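A brief illustration of the LoD layout the sparse-slot loop above produces, assuming a batch of two rows with four ids each (values mirror the csv test data further below):
// Illustration only:
//   batch_sparse_data[0][i] = {1,1,1,1}, batch_sparse_data[1][i] = {2,2,2,2}
//   lod_data      -> {0, 4, 8}           // cumulative row offsets
//   batch_feasign -> {1,1,1,1,2,2,2,2}   // all ids flattened into one column
//   tensor dims   -> {8, 1}, lod -> {{0, 4, 8}}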
void ReadThread(const std::vector<std::string>& file_list,
const DataDesc& data_desc, int thread_id,
std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
VLOG(3) << "[" << thread_id << "]"
<< " reader thread start! thread_id = " << thread_id;
for (auto& file : file_list) {
VLOG(3) << "[" << thread_id << "]"
<< " file " << file;
}
(*thread_status)[thread_id] = Running;
VLOG(3) << "set status to running";
std::shared_ptr<Reader> reader;
if (data_desc.file_type_ == "gzip") {
reader.reset(new MultiFileReader<GzipReader>(file_list));
} else if (data_desc.file_type_ == "plain") {
reader.reset(new MultiFileReader<PlainFileReader>(file_list));
} else {
PADDLE_THROW("do not support file format %s", data_desc.file_type_);
}
VLOG(3) << "reader inited";
if (data_desc.file_format_ == "svm") {
ReadSvmData(data_desc, reader, queue);
} else if (data_desc.file_format_ == "csv") {
ReadCsvData(data_desc, reader, queue);
  }
  (*thread_status)[thread_id] = Stopped;
  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
}
} // namespace reader
...
...@@ -36,9 +36,63 @@ namespace reader {
enum ReaderThreadStatus { Running, Stopped };
struct DataDesc {
DataDesc(int batch_size, const std::vector<std::string>& file_names,
const std::string& file_type, const std::string& file_format,
const std::vector<int>& dense_slot_index,
const std::vector<int>& sparse_slot_index,
const std::vector<std::string>& sparse_slot_ids)
: batch_size_(batch_size),
file_names_(file_names),
file_type_(file_type),
file_format_(file_format),
dense_slot_index_(dense_slot_index),
sparse_slot_index_(sparse_slot_index),
sparse_slot_ids_(sparse_slot_ids) {}
const int batch_size_;
const std::vector<std::string> file_names_;
const std::string file_type_; // gzip or plain
const std::string file_format_; // csv or svm
// used for csv data format
const std::vector<int> dense_slot_index_;
const std::vector<int> sparse_slot_index_;
// used for svm data format
const std::vector<std::string> sparse_slot_ids_;
};
inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) {
os << "data_desc:\n";
os << "\tbatch_size -> " << data_desc.batch_size_ << "\n";
os << "\tfile_type -> " << data_desc.file_type_ << "\n";
os << "\tfile_format -> " << data_desc.file_format_ << "\n";
os << "\tfile_names -> {";
for (auto& file_name : data_desc.file_names_) {
os << file_name << ",";
}
os << "}\n";
os << "\tdense_slot_index -> {";
for (auto& slot : data_desc.dense_slot_index_) {
os << slot << ",";
}
os << "}\n";
os << "\tsparse_slot_index_ -> {";
for (auto& slot : data_desc.sparse_slot_index_) {
os << slot << ",";
}
os << "}\n";
os << "\tsparse_slot_ids_ -> {";
for (auto& slot : data_desc.sparse_slot_ids_) {
os << slot << ",";
}
os << "}\n";
return os;
}
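A DataDesc is just the bundle of reader settings; a usage sketch mirroring the csv unit test later in this change (the file name is hypothetical):
// Usage sketch only: plain csv file, dense features in column 1,
// sparse ids in column 2, no svm slot ids.
DataDesc desc(/*batch_size=*/3, {"part-000.csv"}, /*file_type=*/"plain",
              /*file_format=*/"csv", /*dense_slot_index=*/{1},
              /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});
VLOG(1) << desc;  // prints via the operator<< defined above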
void ReadThread(const std::vector<std::string>& file_list,
                const DataDesc& data_desc, int thread_id,
                std::vector<ReaderThreadStatus>* thread_status,
                std::shared_ptr<LoDTensorBlockingQueue> queue);
// monitor all running threads; if they are all stopped,
...@@ -48,15 +102,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class CTRReader : public framework::FileReader {
 public:
  CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
            int thread_num, const DataDesc& data_desc)
      : data_desc_(data_desc) {
    PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger than 0!");
    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
    PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0,
                      "file list should not be empty");
    thread_num_ = std::min<size_t>(data_desc_.file_names_.size(), thread_num);
    queue_ = queue;
    SplitFiles();
    for (size_t i = 0; i < thread_num_; ++i) {
...@@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader {
    }
  }
  ~CTRReader() { Shutdown(); }
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    bool success;
...@@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader {
    for (auto& read_thread : read_threads_) {
      read_thread->join();
    }
    if (monitor_thread_) {
      monitor_thread_->join();
    }
    read_threads_.clear();
    monitor_thread_.reset(nullptr);
...@@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader {
    queue_->ReOpen();
    VLOG(3) << "reopen success";
    VLOG(3) << "thread_num " << thread_num_;
    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
      read_threads_.emplace_back(new std::thread(std::bind(
          &ReadThread, file_groups_[thread_id], data_desc_,
          static_cast<int>(thread_id), &read_thread_status_, queue_)));
    }
    monitor_thread_.reset(new std::thread(
...@@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader {
 private:
  void SplitFiles() {
    file_groups_.resize(thread_num_);
    for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) {
      auto& file_name = data_desc_.file_names_[i];
      std::ifstream f(file_name.c_str());
      PADDLE_ENFORCE(f.good(), "file %s does not exist!", file_name);
      file_groups_[i % thread_num_].push_back(file_name);
...@@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader {
 private:
  size_t thread_num_;
  const DataDesc data_desc_;
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
  std::vector<std::unique_ptr<std::thread>> read_threads_;
  std::unique_ptr<std::thread> monitor_thread_;
...
...@@ -36,6 +36,7 @@ using paddle::framework::LoD;
using paddle::framework::DDim;
using paddle::platform::CPUPlace;
using paddle::framework::make_ddim;
using paddle::operators::reader::DataDesc;
static void generatedata(const std::vector<std::string>& data,
                         const std::string& file_name) {
...@@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) {
  LoDTensorBlockingQueueHolder queue_holder;
  int capacity = 64;
  queue_holder.InitOnce(capacity, false);
  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
  int batch_size = 3;
  int thread_num = 1;
  std::vector<std::string> sparse_slots = {"6002", "6003"};
  std::vector<std::string> file_list;
  for (int i = 0; i < thread_num; ++i) {
    file_list.push_back(gz_file_name);
  }
  DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {},
                     sparse_slots);
  CTRReader reader(queue, thread_num, data_desc);
  reader.Start();
  size_t batch_num =
      std::ceil(static_cast<float>(ctr_data.size()) / batch_size) * thread_num;
  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
                 &reader);
  reader.Shutdown();
  reader.Start();
  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
                 &reader);
  reader.Shutdown();
}
static void GenerateCsvData(const std::string& file_name,
const std::vector<std::string>& data) {
std::ofstream out(file_name.c_str());
PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
for (auto& c : data) {
out << c;
}
out.close();
PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
}
static void CheckReadCsvOut(const std::vector<LoDTensor>& out) {
ASSERT_EQ(out.size(), 3);
ASSERT_EQ(out[0].dims()[1], 1);
ASSERT_EQ(out[1].dims()[1], 2);
ASSERT_EQ(out[2].dims()[1], 1);
  for (int64_t i = 0; i < out[0].numel(); ++i) {
int64_t label = out[0].data<int64_t>()[i];
auto& dense_dim = out[1].dims();
    for (int64_t j = 0; j < dense_dim[1]; ++j) {
ASSERT_EQ(out[1].data<float>()[i * dense_dim[1] + j],
static_cast<float>(label + 0.1));
}
auto& sparse_lod = out[2].lod();
for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) {
ASSERT_EQ(out[2].data<int64_t>()[j], label);
}
}
}
TEST(CTR_READER, read_csv_data) {
std::string file_name = "test_ctr_reader_data.csv";
const std::vector<std::string> csv_data = {
"0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n",
"3 3.1,3.1 3,3,3,3\n",
};
  GenerateCsvData(file_name, csv_data);
LoDTensorBlockingQueueHolder queue_holder;
int capacity = 64;
queue_holder.InitOnce(capacity, false);
std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
int batch_size = 3;
int thread_num = 1;
std::vector<std::string> file_list;
for (int i = 0; i < thread_num; ++i) {
file_list.push_back(file_name);
}
DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {});
CTRReader reader(queue, thread_num, data_desc);
for (size_t i = 0; i < 2; ++i) {
reader.Start();
std::vector<LoDTensor> out;
while (true) {
reader.ReadNext(&out);
if (out.empty()) {
break;
}
CheckReadCsvOut(out);
}
reader.Shutdown();
}
}
...@@ -32,10 +32,8 @@ class LoDTensorBlockingQueue {
  friend class LoDTensorBlockingQueueHolder;
 private:
  explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false)
      : queue_(capacity, speed_test_mode) {}
 public:
  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
...@@ -65,17 +63,15 @@ class LoDTensorBlockingQueue {
 private:
  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
};
class LoDTensorBlockingQueueHolder {
 public:
  void InitOnce(size_t capacity, bool speed_test_mode = false) {
    PADDLE_ENFORCE(
        queue_ == nullptr,
        "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
    queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode));
  }
  inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
...
...@@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase {
                   "The ReadOp must take a reader as input.");
    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                   "The ReadOp should be assigned with output.");
    if (!ctx->IsRuntime() && ctx->Attrs().Get<bool>("infer_out")) {
      std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
      std::vector<std::string> out_names = ctx->Outputs("Out");
      PADDLE_ENFORCE_EQ(
          reader_dims.size(), out_names.size(),
          "The reader's dim number doesn't match the output number.");
      ctx->SetOutputsDim("Out", reader_dims);
      auto in_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
      auto in_lod_levels = in_desc->GetLoDLevels();
...@@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
    if (infer_out) {
      std::string reader_name = op_desc.Input("Reader")[0];
      std::vector<std::string> out_names = op_desc.Output("Out");
      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
      auto dtypes = reader->GetDataTypes();
      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
      for (size_t i = 0; i < dtypes.size(); ++i) {
        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
        out.SetType(framework::proto::VarType::LOD_TENSOR);
        out.SetDataType(dtypes[i]);
      }
    }
  }
};
...@@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    VLOG(3) << "read op in";
    framework::ReaderHolder* reader =
        detail::Ref(scope.FindVar(Input("Reader")),
                    "Cannot find reader variable %s", Input("Reader"))
...@@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase {
    reader->ReadNext(&ins);
    if (ins.empty()) {
      VLOG(3) << "read empty data in";
      if (Attr<bool>("throw_eof_exp")) {
        VLOG(3) << "throw_eof_exp";
        PADDLE_THROW_EOF();
      } else {
        ins.resize(out_arg_names.size());
...@@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase {
          tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
        }
      }
      VLOG(3) << "read empty data out";
    }
    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
    for (size_t i = 0; i < out_arg_names.size(); ++i) {
...@@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
                  " only when the data-balance is enabled in ParallelExecutor"
                  " and it is set by ParallelExecutor instance, not users.")
        .SetDefault(true);
    AddAttr<bool>("infer_out", "").SetDefault(true);
    AddComment(R"DOC(
      Read Operator
...
...@@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() {
           "It means the reader will generate two data each time,"
           "whose shapes are [2,3,4] and [5,6] respectively.");
  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
  AddAttr<bool>(
      "use_data_config",
      "Use the config of all data, like shape_concat/ranks/lod_levels")
      .SetDefault(true);
  Apply();
}
...@@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output file reader should not be null.");
  bool use_data_config = ctx->Attrs().Get<bool>("use_data_config");
  if (use_data_config) {
    const auto shape_concat =
        ctx->Attrs().Get<std::vector<int>>("shape_concat");
    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    ctx->SetReaderDims("Out", shapes);
    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
                      "The number of 'lod_levels'(%d) doesn't match the number "
                      "of 'shapes'(%d).",
                      lod_levels.size(), shapes.size());
    framework::VarDesc* reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    reader->SetLoDLevels(lod_levels);
  }
}
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
...
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
namespace paddle {
namespace operators {
class ShuffleChannelOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ShuffleChannelOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ShuffleChannelOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input must be NCHW.");
ctx->SetOutputDim("Out", input_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
ctx.device_context());
}
};
class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), "
"the input feature data of ShuffleChannelOp, the layout is NCHW.");
AddOutput("Out",
"(Tensor, default Tensor<float>), the output of "
"ShuffleChannelOp. The layout is NCHW.");
AddAttr<int>("group", "the number of groups.")
.SetDefault(1)
.AddCustomChecker([](const int& group) {
PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0.");
});
AddComment(R"DOC(
Shuffle Channel operator
This operator shuffles the channels of input x.
It divides the input channels in each group into several subgroups,
and obtains a new order by selecting elements from every subgroup one by one.
The shuffle channel operation makes it possible to build more powerful
structures with multiple group convolutional layers.
Please refer to the following paper for more information:
https://arxiv.org/pdf/1707.01083.pdf
)DOC");
}
};
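The DOC above describes the shuffle in words; a small worked example of the index mapping implemented by the kernels below, for group = 2 and 6 channels (illustrative only):
// group_row = 2, group_column = 6 / 2 = 3; input channel
// (i * group_column + j) moves to output channel (j * group_row + i):
//   input  channel order: 0 1 2 3 4 5
//   output channel order: 0 3 1 4 2 5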
class ShuffleChannelGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) should not be null");
auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input must be NCHW.");
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
ops::ShuffleChannelOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);
REGISTER_OP_CPU_KERNEL(
shuffle_channel,
ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
shuffle_channel_grad,
ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
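NumBlocks is the usual ceiling division capped by the block limit; a quick check of the arithmetic (illustrative):
// Example: N = 1 << 20 elements with 512 threads per block gives
// (1048576 + 511) / 512 = 2048 blocks, below the 4096 cap, so 2048 is used.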
template <typename T>
__global__ void ShuffleChannel(const int nthreads, const int feature_map_size,
T* output, const T* input, int group_row,
int group_column, int len) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
  for (int ii = index; ii < nthreads; ii += offset) {
    const int n = ii / group_row / group_column / len;
    const int i = (ii / group_column / len) % group_row;
    const int j = ii / len % group_column;
    const int k = ii - (n * feature_map_size + (i * group_column + j) * len);
    T* p_o = output + n * feature_map_size + (j * group_row + i) * len;
    p_o[k] = input[ii];
}
}
template <typename DeviceContext, typename T>
class ShuffleChannelOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
    // count is the product of N*C*H*W, the same as numel()
int count = num * group_column * group_row * sp_sz;
int blocks = NumBlocks(output->numel());
int threads = kNumCUDAThreads;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
ShuffleChannel<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
count, feature_map_size, output_data, input_data, group_row,
group_column, sp_sz);
}
};
template <typename DeviceContext, typename T>
class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
auto* output_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* input_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const T* output_grad_data = output_grad->data<T>();
int blocks = NumBlocks(output_grad->numel());
int threads = kNumCUDAThreads;
int count = num * group_column * group_row * sp_sz;
ShuffleChannel<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
count, feature_map_size, input_grad_data, output_grad_data, group_row,
group_column, sp_sz);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
shuffle_channel,
ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(
shuffle_channel_grad,
ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ShuffleChannelOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
for (int n = 0; n < num; ++n) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_column; ++j) {
const T* p_i = input_data + n * feature_map_size +
(i * group_column + j) * sp_sz;
T* p_o =
output_data + n * feature_map_size + (j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(T) * sp_sz);
}
}
}
}
};
template <typename DeviceContext, typename T>
class ShuffleChannelGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
auto* output_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* input_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const T* output_grad_data = output_grad->data<T>();
for (int n = 0; n < num; ++n) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_column; ++j) {
const T* p_i = output_grad_data + n * feature_map_size +
(i * group_column + j) * sp_sz;
T* p_o = input_grad_data + n * feature_map_size +
(j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(T) * sp_sz);
}
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDuplicable(); AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("calibration_data", "the calibration data for int8");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
AddAttr<int>("max_batch_size", "the maximum batch size."); AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size."); AddAttr<int>("workspace_size", "the workspace size.");
AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
AddAttr<bool>("enable_int8", "whether swith to int8 mode");
AddComment("TensorRT engine operator."); AddComment("TensorRT engine operator.");
} }
}; };
...@@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { ...@@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
ops::TensorRTEngineOpMaker); ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -17,8 +17,10 @@ ...@@ -17,8 +17,10 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
...@@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { ...@@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
using inference::Singleton; using inference::Singleton;
using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
class TensorRTEngineOp : public framework::OperatorBase { class TensorRTEngineOp : public framework::OperatorBase {
private: private:
...@@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
mutable std::unique_ptr<TensorRTEngine> trt_engine_; mutable std::unique_ptr<TensorRTEngine> trt_engine_;
int max_batch_size_; int max_batch_size_;
int workspace_size_; int workspace_size_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
bool enable_int8_;
std::string calibration_data_;
std::string engine_key_;
bool calibration_mode_;
public: public:
TensorRTEngineOp(const std::string &type, TensorRTEngineOp(const std::string &type,
...@@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs"); input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size"); max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size"); workspace_size_ = Attr<int>("workspace_size");
enable_int8_ = Attr<bool>("enable_int8");
calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters"); auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) { for (const auto &param : params) {
param_names_.insert(param); param_names_.insert(param);
} }
// calibration_mode being true means we need to
// generate the calibration table data.
calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0);
VLOG(4) << "calibration_mode: " << calibration_mode_;
if (enable_int8_ && calibration_data_.size()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
}
} }
protected: protected:
void RunNativeImpl(const framework::Scope &scope,
const platform::Place &dev_place) const {
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block");
auto *program = block->Program();
auto &current_scope = scope.NewScope();
auto ctx = executor.Prepare(*program, block->ID());
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
}
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
if (calibration_mode_) {
RunCalibration(scope, dev_place);
return;
}
RunTrt(scope, dev_place); RunTrt(scope, dev_place);
} }
void RunCalibration(const framework::Scope &scope,
const platform::Place &dev_place) const {
// This process builds a 32-bit trt engine, runs it on the calibration
// set, and records a histogram of the distribution of activation values
// for each tensor.
LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
<< " is running TRT int8 calibration... ";
int runtime_batch = 1;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
TRTCalibratorEngine *calib_res =
Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
std::unordered_map<std::string, size_t> calib_buffers;
for (auto &x : input_names_) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
calib_buffers[x] = t.memory_size();
auto t_shape = framework::vectorize(t.dims());
runtime_batch = t_shape[0];
}
calib_res->calib_.reset(new TRTInt8Calibrator(
calib_buffers, runtime_batch, engine_key_, dev_place));
calib_res->thr_.reset(new std::thread([&]() {
calib_res->engine_.reset(new TensorRTEngine(
max_batch_size_, workspace_size_, stream,
boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
calib_res->calib_.get()));
VLOG(3) << "start the calib trt engine thread";
Prepare(scope, dev_place, calib_res->engine_.get());
}));
}
TRTInt8Calibrator *temp_calibrator =
Singleton<TRTCalibratorEngineManager>::Global()
.Get(engine_key_)
->calib_.get();
std::unordered_map<std::string, void *> calib_data;
for (auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
calib_data.emplace(x, t.data<void>());
}
temp_calibrator->setBatch(calib_data);
RunNativeImpl(scope, dev_place);
}
void RunTrt(const framework::Scope &scope, void RunTrt(const framework::Scope &scope,
const platform::Place &dev_place) const { const platform::Place &dev_place) const {
int runtime_batch = 1; int runtime_batch = 1;
...@@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (trt_engine_.get() == nullptr) { if (trt_engine_.get() == nullptr) {
trt_engine_.reset(new TensorRTEngine( trt_engine_.reset(
max_batch_size_, workspace_size_, stream, new TensorRTEngine(max_batch_size_, workspace_size_, stream,
boost::get<platform::CUDAPlace>(dev_place).device)); boost::get<platform::CUDAPlace>(dev_place).device,
enable_int8_, calibrator_.get()));
Prepare(scope, dev_place, trt_engine_.get()); Prepare(scope, dev_place, trt_engine_.get());
} }
...@@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
void Prepare(const framework::Scope &scope, const platform::Place &dev_place, void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
TensorRTEngine *engine) const { TensorRTEngine *engine) const {
VLOG(4) << "Prepare engine"; LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph")); block_desc.ParseFromString(Attr<std::string>("subgraph"));
......
...@@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { ...@@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); engine_op_desc.SetBlockAttr("sub_block", &block_desc);
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2); engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20); engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
std::vector<std::string>({})); engine_op_desc.SetAttr("calibration_data", std::string(""));
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
"output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z0"})); std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
LOG(INFO) << "engine_op " << engine_op.get(); LOG(INFO) << "engine_op " << engine_op.get();
framework::Scope scope; framework::Scope scope;
...@@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", engine_op_desc.SetBlockAttr("sub_block", &block_desc);
block_->SerializeAsString()); engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size); engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20); engine_op_desc.SetAttr("parameters",
SetAttr<std::vector<std::string>>( std::vector<std::string>({"y0", "y1", "y2", "y3"}));
engine_op_desc.Proto(), "parameters", engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
std::vector<std::string>({"y0", "y1", "y2", "y3"})); engine_op_desc.SetAttr("calibration_data", std::string(""));
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping",
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), std::vector<std::string>({"z3"}));
"output_name_mapping", engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
std::vector<std::string>({"z3"}));
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
// Execute them. // Execute them.
engine_op->Run(scope, place); engine_op->Run(scope, place);
......
...@@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel<T> { ...@@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace()); T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), loss, static_cast<T>(0)); auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto cudnn_func = [&](void* cudnn_workspace) {
auto temp_allocation = CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
workspace_size); warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
void* cudnn_workspace = temp_allocation->ptr(); loss_data, cu_grad_desc, warpctc_grad_data,
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( workspace_size));
handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, };
warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, workspace_handle.RunFunc(cudnn_func, workspace_size);
cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
cu_ctcloss_desc, cudnn_workspace, workspace_size));
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cuda.h> #include <cuda.h>
// NOTE: supports float16-to-half conversion in this header file. // NOTE: supports float16-to-half conversion in this header file.
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
...@@ -30,6 +31,34 @@ namespace platform { ...@@ -30,6 +31,34 @@ namespace platform {
mask = __ballot_sync(FULL_WARP_MASK, (predicate)) mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif #endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
return 1024;
} else if (dim > 256) {
return 512;
} else if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
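// The macros below expand to switch cases that bind the selected power-of-two
// dimension to the constexpr kPowerOfTwoDim, so a kernel templated on its
// block size can be dispatched at runtime. A typical (hypothetical) use:
//   switch (RoundToPowerOfTwo(dim)) {
//     CUDA_LAUNCH_KERNEL_HELPER(
//         MyKernel<T, kPowerOfTwoDim><<<grid, kPowerOfTwoDim, 0, stream>>>(
//             args...));
//   }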
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
__VA_ARGS__; \
} break
#define CUDA_LAUNCH_KERNEL_HELPER(...) \
CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
template <typename T> template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta, int width = 32) { int delta, int width = 32) {
......
...@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { ...@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
PADDLE_THROW( PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU " "Place %s is not supported, Please re-compile with WITH_GPU "
"option"); "option",
place);
} }
return it->second.get().get(); return it->second.get().get();
} }
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <algorithm> #include <algorithm>
#include <cstdlib>
#include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", ...@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "",
namespace paddle { namespace paddle {
namespace platform { namespace platform {
int GetCUDADeviceCount() { static int GetCUDADeviceCountImpl() {
const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
if (cuda_visible_devices != nullptr) {
std::string cuda_visible_devices_str(cuda_visible_devices);
if (std::all_of(cuda_visible_devices_str.begin(),
cuda_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
return 0;
}
}
int count; int count;
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaGetDeviceCount(&count), cudaGetDeviceCount(&count),
...@@ -66,6 +79,11 @@ int GetCUDADeviceCount() { ...@@ -66,6 +79,11 @@ int GetCUDADeviceCount() {
return count; return count;
} }
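// NOTE: the device count is computed once and cached in a function-local
// static, so changes to CUDA_VISIBLE_DEVICES after the first call have no
// effect.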
int GetCUDADeviceCount() {
static auto dev_cnt = GetCUDADeviceCountImpl();
return dev_cnt;
}
int GetCUDAComputeCapability(int id) { int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
cudaDeviceProp device_prop; cudaDeviceProp device_prop;
...@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { ...@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) { enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d)",
src, dst, static_cast<int>(count));
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) { enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync"); "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
"%p, length: %d)",
src, dst, static_cast<int>(count));
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
......
...@@ -15,18 +15,38 @@ limitations under the License. */ ...@@ -15,18 +15,38 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
// Bind Methods // Bind Methods
void BindTracer(pybind11::module *m) { void BindTracer(pybind11::module* m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "") pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer& self, framework::BlockDesc* root_block) {
new (&self) imperative::Tracer(root_block); new (&self) imperative::Tracer(root_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace",
[](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::CPUPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("trace",
[](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::CUDAPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("py_trace", &imperative::Tracer::PyTrace, .def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership); pybind11::return_value_policy::take_ownership);
} }
......
...@@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { ...@@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) {
} }
void BindAnalysisConfig(py::module *m) { void BindAnalysisConfig(py::module *m) {
py::class_<AnalysisConfig>(*m, "AnalysisConfig") py::class_<AnalysisConfig> analysis_config(*m, "AnalysisConfig");
.def(py::init<const AnalysisConfig &>())
py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
.value("Float32", AnalysisConfig::Precision::kFloat32)
.value("Int8", AnalysisConfig::Precision::kInt8)
.export_values();
analysis_config.def(py::init<const AnalysisConfig &>())
.def(py::init<const std::string &>()) .def(py::init<const std::string &>())
.def(py::init<const std::string &, const std::string &>()) .def(py::init<const std::string &, const std::string &>())
.def("set_model", (void (AnalysisConfig::*)(const std::string &)) & .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
...@@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { ...@@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
.def("specify_input_name", &AnalysisConfig::specify_input_name) .def("specify_input_name", &AnalysisConfig::specify_input_name)
.def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3) py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
py::arg("x") = true) py::arg("x") = true)
......
...@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { ...@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) {
.def("_grad_ivar", .def("_grad_ivar",
[](const imperative::VarBase &self) { return self.grads_; }, [](const imperative::VarBase &self) { return self.grads_; },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CPUPlace &place,
bool blocking) {
std::unique_ptr<imperative::VarBase> new_var =
self.NewVarBase(place, blocking);
return new_var.release();
},
py::return_value_policy::take_ownership)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) {
std::unique_ptr<imperative::VarBase> new_var =
self.NewVarBase(place, blocking);
return new_var.release();
},
py::return_value_policy::take_ownership)
.def("value", [](const imperative::VarBase &self) { return self.var_; }, .def("value", [](const imperative::VarBase &self) { return self.var_; },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property( .def_property(
...@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("start", &framework::ReaderHolder::Start)
.def("reset", &framework::ReaderHolder::ResetAll); .def("reset", &framework::ReaderHolder::ResetAll);
using LoDTensorBlockingQueue = using LoDTensorBlockingQueue =
...@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle.
.def("is_closed", &LoDTensorBlockingQueue::IsClosed); .def("is_closed", &LoDTensorBlockingQueue::IsClosed);
m.def("init_lod_tensor_blocking_queue", m.def("init_lod_tensor_blocking_queue",
[](Variable &var, size_t capacity, [](Variable &var,
const std::vector<std::vector<int64_t>> &shapes) size_t capacity) -> std::shared_ptr<LoDTensorBlockingQueue> {
-> std::shared_ptr<LoDTensorBlockingQueue> { auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
std::vector<DDim> dims(shapes.size()); holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
std::transform(shapes.begin(), shapes.end(), dims.begin(), return holder->GetQueue();
[](const std::vector<int64_t> &shape) { },
return make_ddim(shape);
});
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims,
FLAGS_reader_queue_speed_test_mode);
return holder->GetQueue();
},
py::return_value_policy::copy); py::return_value_policy::copy);
py::class_<Scope>(m, "_Scope", R"DOC( py::class_<Scope>(m, "_Scope", R"DOC(
...@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace")
.def(py::init<int>()) .def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(
dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(),
"Invalid CUDAPlace(%d), must inside [0, %d)", dev_id,
platform::GetCUDADeviceCount());
new (&self) platform::CUDAPlace(dev_id);
#else
PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
#endif
})
.def("__str__", string::to_string<const platform::CUDAPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
...@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle.
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace") py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def(py::init<>()) .def("__init__",
[](platform::CUDAPinnedPlace &) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
#endif
})
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>); .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
...@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.remove_unnecessary_lock_ = b; self.remove_unnecessary_lock_ = b;
}, },
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC")
.def_property( .def_property(
"num_trainers", "num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; }, [](const BuildStrategy &self) { return self.num_trainers_; },
......
...@@ -173,7 +173,6 @@ function cmake_gen() { ...@@ -173,7 +173,6 @@ function cmake_gen() {
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DSO=ON -DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${distibuted_flag} -DWITH_DISTRIBUTE=${distibuted_flag}
...@@ -208,7 +207,6 @@ EOF ...@@ -208,7 +207,6 @@ EOF
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DSO=ON \ -DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${distibuted_flag} \ -DWITH_DISTRIBUTE=${distibuted_flag} \
...@@ -328,7 +326,8 @@ function run_brpc_test() { ...@@ -328,7 +326,8 @@ function run_brpc_test() {
======================================== ========================================
EOF EOF
set +x set +x
declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test") declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \
"rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test")
all_tests=`ctest -N` all_tests=`ctest -N`
for t in "${other_tests[@]}" for t in "${other_tests[@]}"
...@@ -527,31 +526,6 @@ function bind_test() { ...@@ -527,31 +526,6 @@ function bind_test() {
wait wait
} }
function gen_docs() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
cat <<EOF
========================================
Building documentation ...
In /paddle/build
========================================
EOF
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
}
function gen_doc_lib() { function gen_doc_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
...@@ -563,7 +537,6 @@ function gen_doc_lib() { ...@@ -563,7 +537,6 @@ function gen_doc_lib() {
EOF EOF
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_FLUID_ONLY=ON -DWITH_FLUID_ONLY=ON
...@@ -802,9 +775,6 @@ function main() { ...@@ -802,9 +775,6 @@ function main() {
bind_test) bind_test)
bind_test bind_test
;; ;;
doc)
gen_docs
;;
gen_doc_lib) gen_doc_lib)
gen_doc_lib $2 gen_doc_lib $2
;; ;;
......
...@@ -22,6 +22,8 @@ from . import op_frequence ...@@ -22,6 +22,8 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import reader
from .reader import *
from . import slim from . import slim
from .slim import * from .slim import *
from . import utils from . import utils
...@@ -32,5 +34,6 @@ __all__ += decoder.__all__ ...@@ -32,5 +34,6 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += reader.__all__
__all__ += slim.__all__ __all__ += slim.__all__
__all__ += utils.__all__ __all__ += utils.__all__
...@@ -32,10 +32,13 @@ class Calibrator(object): ...@@ -32,10 +32,13 @@ class Calibrator(object):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.program = kwargs['program'] self.program = kwargs['program']
self.iterations = kwargs['iterations']
self.pretrained_model = kwargs['pretrained_model'] self.pretrained_model = kwargs['pretrained_model']
self.debug = kwargs['debug'] self.debug = kwargs['debug'] if 'debug' in kwargs else False
self.algo = kwargs['algo'] self.algo = kwargs['algo']
self.output = kwargs['output']
self.feed_var_names = kwargs['feed_var_names']
self.fetch_list = kwargs['fetch_list']
self.exe = kwargs['exe']
self._conv_input_var_name = [] self._conv_input_var_name = []
self._conv_output_var_name = [] self._conv_output_var_name = []
...@@ -54,17 +57,38 @@ class Calibrator(object): ...@@ -54,17 +57,38 @@ class Calibrator(object):
self._u8_output_var = [] self._u8_output_var = []
self._s8_output_var = [] self._s8_output_var = []
self._persistable_vars = [] self._persistable_vars = []
self._sampling_data = {}
def generate_sampling_program(self):
self.__init_analysis() self.__init_analysis()
self.__generate_output_program() self.__generate_output_program()
def generate_quantized_data(self, sampling_data): def save_int8_model(self):
self.__sampling(sampling_data) self.__sampling(self._sampling_data)
self.__save_scale() self.__save_scale()
self.__update_program() self.__update_program()
self.__update_output_program_attr() self.__update_output_program_attr()
self.__display_debug() self.__display_debug()
self.__save_offline_model()
def sample_data(self):
'''
Sample the tensor data of the variables.
'''
for i in self.sampling_program.list_vars():
if i.name in self.sampling_vars:
np_data = np.array(fluid.global_scope().find_var(i.name)
.get_tensor())
if i.name not in self._sampling_data:
self._sampling_data[i.name] = []
self._sampling_data[i.name].append(np_data)
def __save_offline_model(self):
'''
Save the quantized model to the disk.
'''
fluid.io.save_inference_model(self.output, self.feed_var_names,
self.fetch_list, self.exe,
self.sampling_program)
def __display_debug(self): def __display_debug(self):
if self.debug: if self.debug:
......
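Taken together, the Calibrator now owns the sampling buffer and the save step. A minimal sketch of the reworked calibration flow (the executor, reader, and path variables are hypothetical placeholders; the method names are those in the diff above):

calibrator = int8_utility.Calibrator(
    program=infer_program, pretrained_model=model_path, algo='direct',
    exe=exe, output=int8_model_dir, feed_var_names=feed_names,
    fetch_list=fetch_targets)
calibrator.generate_sampling_program()
for data in val_reader():
    exe.run(calibrator.sampling_program,
            feed={feed_names[0]: data},
            fetch_list=fetch_targets)
    calibrator.sample_data()      # buffer activations for this batch
calibrator.save_int8_model()      # compute scales and save the INT8 model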
## CTR READER
A multi-threaded C++ reader that has the same interface as py_reader. It
uses C++ threads to read files and is much faster than the Python read
thread in py_reader.
Currently, it supports two types of file:
- gzip
- plain text file
and two types of data format:
- the csv data format is:
* label dense_fea,dense_fea sparse_fea,sparse_fea
- the svm data format is:
* label slot1:fea_sign slot2:fea_sign slot1:fea_sign
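For illustration (hypothetical feature values), a csv line would look like `1 0.5,0.3 1001,1002` and an svm line like `0 1:8362 2:93 1:71`.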
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import ctr_reader
__all__ = ctr_reader.__all__
...@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \ ...@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
default_startup_program, Variable default_startup_program, Variable
from paddle.fluid.unique_name import generate as unique_name from paddle.fluid.unique_name import generate as unique_name
__all__ = ['ctr_reader']
def monkey_patch_reader_methods(reader): def monkey_patch_reader_methods(reader):
def __get_reader__(): def __get_reader__():
...@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader): ...@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
def reset(): def reset():
return __get_reader__().reset() return __get_reader__().reset()
def start():
return __get_reader__().start()
reader.reset = reset reader.reset = reset
reader.start = start
reader.stop_gradient = True reader.stop_gradient = True
reader.persistable = True reader.persistable = True
return reader return reader
...@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var): ...@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
return new_var return new_var
def ctr_reader(feed_data, def ctr_reader(
capacity, feed_dict,
thread_num, file_type, # gzip or plain
batch_size, file_format, # csv or svm
file_list, dense_slot_index,
slots, sparse_slot_index,
name=None): capacity,
thread_num,
batch_size,
file_list,
slots,
name=None):
""" """
Create a CTR reader for data feeding in Python Create a CTR reader for data feeding in Python
...@@ -67,12 +78,21 @@ def ctr_reader(feed_data, ...@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
Args: Args:
feed_dict(list(variable)): a list of data variable.
file_type('gzip'|'plain'): the type of the data file
file_format('csv'|'svm'): csv data or svm data format.
csv data format is:
label dense_fea,dense_fea sparse_fea,sparse_fea
the svm data format is:
label slot1:fea_sign slot2:fea_sign slot1:fea_sign
dense_slot_index(list(int)): the index of dense slots
sparse_slot_index(list(int)): the index of sparse slots
capacity(int): The buffer capacity maintained by :code:`py_reader`. capacity(int): The buffer capacity maintained by :code:`py_reader`.
thread_num(list|tuple): List of tuples which declaring data shapes. thread_num(int): the number of threads used to read the files.
batch_size(list|tuple): List of strs which declaring data type. batch_size(int): the batch size of the data.
file_list(list|tuple): List of ints which declaring data lod_level. file_list(list(str)): list of the file names to read.
slots(bool): Whether use double buffer or not. slots(list(int64)): list of slot ids.
name(basestring): The prefix Python queue name and Reader name. None will name(string): The prefix Python queue name and Reader name. None will
be generated automatically. be generated automatically.
Returns: Returns:
...@@ -80,7 +100,15 @@ def ctr_reader(feed_data, ...@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
Examples: Examples:
1. The basic usage of :code:`py_reader` is as follows: 1. The basic usage of :code:`ctr_reader` is as follows:
.. code-block:: python
py_reader = fluid.contrib.ctr_reader.ctr_reader(
feed_dict=datas, file_type='plain', file_format='csv',
file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
""" """
if name is None: if name is None:
queue_name = unique_name('lod_tensor_blocking_queue') queue_name = unique_name('lod_tensor_blocking_queue')
...@@ -90,7 +118,7 @@ def ctr_reader(feed_data, ...@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
reader_name = "_".join([name, "reader"]) reader_name = "_".join([name, "reader"])
var = global_scope().var(queue_name) var = global_scope().var(queue_name)
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
reader_var = startup_blk.create_var(name=reader_name) reader_var = startup_blk.create_var(name=reader_name)
...@@ -99,12 +127,22 @@ def ctr_reader(feed_data, ...@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
inputs={'blocking_queue': [queue_name]}, inputs={'blocking_queue': [queue_name]},
outputs={'Out': [reader_var]}, outputs={'Out': [reader_var]},
attrs={ attrs={
'use_data_config': False,
'thread_num': thread_num, 'thread_num': thread_num,
'batch_size': batch_size, 'batch_size': batch_size,
'file_list': file_list, 'file_list': file_list,
'slots': slots, 'file_type': file_type,
'file_format': file_format,
'dense_slot_index': dense_slot_index,
'sparse_slot_index': sparse_slot_index,
'sparse_slots': slots,
'ranks': [],
'lod_levels': [],
'shape_concat': []
}) })
dtypes = [data.dtype for data in feed_dict]
reader_var.desc.set_dtypes(dtypes)
reader_var.persistable = True reader_var.persistable = True
main_prog_reader_var = _copy_reader_var_( main_prog_reader_var = _copy_reader_var_(
...@@ -118,6 +156,9 @@ def ctr_reader(feed_data, ...@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
main_blk = default_main_program().current_block() main_blk = default_main_program().current_block()
main_blk.append_op( main_blk.append_op(
type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data}) type='read',
inputs={'Reader': [reader]},
attrs={'infer_out': False},
outputs={'Out': feed_dict})
return reader return reader
...@@ -23,10 +23,11 @@ import argparse ...@@ -23,10 +23,11 @@ import argparse
import functools import functools
import contextlib import contextlib
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
from paddle.dataset.common import download
from PIL import Image, ImageEnhance from PIL import Image, ImageEnhance
import math import math
sys.path.append('..') sys.path.append('..')
import int8_inference.utility as ut import int8_inference.utility as int8_utility
random.seed(0) random.seed(0)
np.random.seed(0) np.random.seed(0)
...@@ -116,27 +117,43 @@ def val(data_dir=DATA_DIR): ...@@ -116,27 +117,43 @@ def val(data_dir=DATA_DIR):
return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
class TestCalibration(unittest.TestCase): class TestCalibrationForResnet50(unittest.TestCase):
def setUp(self): def setUp(self):
# TODO(guomingz): Put the download process in the cmake. self.int8_download = 'int8/download'
# Download and unzip test data set self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' self.int8_download)
zip_file_name = imagenet_dl_url.split('/')[-1]
cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format( data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
zip_file_name, imagenet_dl_url, zip_file_name) data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d'
os.system(cmd) self.data_cache_folder = self.download_data(data_url, data_md5, "data")
# resnet50 fp32 data
resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' # reader/decorator.py requires the relative path to the data folder
resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1] cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
resnet50_unzip_folder_name = 'resnet50_fp32' self.data_cache_folder)
cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format(
resnet50_unzip_folder_name, resnet50_zip_name,
resnet50_unzip_folder_name, resnet50_fp32_model_url,
resnet50_zip_name, resnet50_unzip_folder_name)
os.system(cmd) os.system(cmd)
self.iterations = 100 self.iterations = 50
self.skip_batch_num = 5
def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
zip_path)
os.system(cmd)
def download_data(self, data_url, data_md5, folder_name):
download(data_url, self.int8_download, data_md5)
data_cache_folder = os.path.join(self.cache_folder, folder_name)
file_name = data_url.split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
self.cache_unzipping(data_cache_folder, zip_path)
return data_cache_folder
def download_resnet50_model(self):
# resnet50 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
data_md5 = '4a5194524823d9b76da6e738e1367881'
self.model_cache_folder = self.download_data(data_url, data_md5,
"resnet50_fp32")
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
...@@ -163,16 +180,15 @@ class TestCalibration(unittest.TestCase): ...@@ -163,16 +180,15 @@ class TestCalibration(unittest.TestCase):
print("Start calibration ...") print("Start calibration ...")
calibrator = ut.Calibrator( calibrator = int8_utility.Calibrator(
program=infer_program, program=infer_program,
pretrained_model=model_path, pretrained_model=model_path,
iterations=100, algo=algo,
debug=False, exe=exe,
algo=algo) output=int8_model,
feed_var_names=feed_dict,
sampling_data = {} fetch_list=fetch_targets)
calibrator.generate_sampling_program()
test_info = [] test_info = []
cnt = 0 cnt = 0
for batch_id, data in enumerate(val_reader()): for batch_id, data in enumerate(val_reader()):
...@@ -192,13 +208,7 @@ class TestCalibration(unittest.TestCase): ...@@ -192,13 +208,7 @@ class TestCalibration(unittest.TestCase):
feed_dict[1]: label}, feed_dict[1]: label},
fetch_list=fetch_targets) fetch_list=fetch_targets)
if generate_int8: if generate_int8:
for i in calibrator.sampling_program.list_vars(): calibrator.sample_data()
if i.name in calibrator.sampling_vars:
np_data = np.array(fluid.global_scope().find_var(i.name)
.get_tensor())
if i.name not in sampling_data:
sampling_data[i.name] = []
sampling_data[i.name].append(np_data)
test_info.append(np.mean(acc1) * len(data)) test_info.append(np.mean(acc1) * len(data))
cnt += len(data) cnt += len(data)
...@@ -209,18 +219,35 @@ class TestCalibration(unittest.TestCase): ...@@ -209,18 +219,35 @@ class TestCalibration(unittest.TestCase):
break break
if generate_int8: if generate_int8:
calibrator.generate_quantized_data(sampling_data) calibrator.save_int8_model()
fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets,
exe, calibrator.sampling_program)
print( print(
"Calibration is done and the corresponding files were generated at {}". "Calibration is done and the corresponding files are generated at {}".
format(os.path.abspath("calibration_out"))) format(os.path.abspath("calibration_out")))
else: else:
return np.sum(test_info) / cnt return np.sum(test_info) / cnt
def test_calibration_for_resnet50(self): def test_calibration(self):
fp32_acc1 = self.run_program("resnet50_fp32/model") self.download_resnet50_model()
self.run_program("resnet50_fp32/model", True) fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
self.run_program(self.model_cache_folder + "/model", True)
int8_acc1 = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01)
class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
def download_mobilenetv1_model(self):
# mobilenetv1 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
data_md5 = '13892b0716d26443a8cdea15b3c6438b'
self.model_cache_folder = self.download_data(data_url, data_md5,
"mobilenetv1_fp32")
def test_calibration(self):
self.download_mobilenetv1_model()
fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
self.run_program(self.model_cache_folder + "/model", True, algo='KL')
int8_acc1 = self.run_program("calibration_out") int8_acc1 = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1) delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01) self.assertLess(delta_value, 0.01)
......
...@@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ...@@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None _imperative_tracer_ = None
_imperative_current_expected_place_ = None
def _in_imperative_mode(): def _in_imperative_mode():
...@@ -80,6 +81,10 @@ def _imperative_tracer(): ...@@ -80,6 +81,10 @@ def _imperative_tracer():
return _imperative_tracer_ return _imperative_tracer_
def _current_expected_place():
return _imperative_current_expected_place_
class NameScope(object): class NameScope(object):
def __init__(self, name="", parent=None): def __init__(self, name="", parent=None):
self._children = dict() self._children = dict()
...@@ -383,8 +388,8 @@ class Variable(object): ...@@ -383,8 +388,8 @@ class Variable(object):
self._ivar.stop_gradient = stop_gradient self._ivar.stop_gradient = stop_gradient
def _numpy(self): def _numpy(self):
tensor = self._ivar.value().get_tensor() new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
return np.array(tensor) return np.array(new_ivar.value().get_tensor())
def _backward(self): def _backward(self):
self._ivar._run_backward() self._ivar._run_backward()
...@@ -1311,6 +1316,7 @@ class Block(object): ...@@ -1311,6 +1316,7 @@ class Block(object):
def _trace_op(self, op, stop_gradient=False): def _trace_op(self, op, stop_gradient=False):
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
_imperative_current_expected_place_,
stop_gradient) stop_gradient)
def _insert_op(self, index, *args, **kwargs): def _insert_op(self, index, *args, **kwargs):
...@@ -2502,5 +2508,18 @@ def _imperative_guard(tracer): ...@@ -2502,5 +2508,18 @@ def _imperative_guard(tracer):
global _imperative_tracer_ global _imperative_tracer_
tmp_trace = _imperative_tracer_ tmp_trace = _imperative_tracer_
_imperative_tracer_ = tracer _imperative_tracer_ = tracer
yield yield
_imperative_tracer_ = tmp_trace _imperative_tracer_ = tmp_trace
@contextlib.contextmanager
def _imperative_place_guard(place):
global _imperative_current_expected_place_
tmp_place = _imperative_current_expected_place_
_imperative_current_expected_place_ = place
yield
_imperative_current_expected_place_ = tmp_place
...@@ -25,18 +25,28 @@ def enabled(): ...@@ -25,18 +25,28 @@ def enabled():
@contextlib.contextmanager @contextlib.contextmanager
def guard(): def guard(place=None):
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
tracer = core.Tracer(train.current_block().desc) tracer = core.Tracer(train.current_block().desc)
if place is None:
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._imperative_guard(tracer):
yield with framework._imperative_place_guard(place):
yield
def to_variable(value, block=None): def to_variable(value, block=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode"
if not block: if not block:
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
py_var = framework.Variable( py_var = framework.Variable(
...@@ -47,9 +57,7 @@ def to_variable(value, block=None): ...@@ -47,9 +57,7 @@ def to_variable(value, block=None):
dtype=value.dtype) dtype=value.dtype)
var = py_var._ivar.value() var = py_var._ivar.value()
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, core.CPUPlace()) tensor.set(value, framework._current_expected_place())
return py_var return py_var
elif isinstance(value, framework.Variable): elif isinstance(value, framework.Variable):
return value return value
else:
raise ValueError("Unsupported type %s" % type(value))
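A minimal usage sketch of the place-aware guard (the model code is hypothetical; with no explicit place, guard() now picks CUDAPlace(0) on a CUDA build and CPUPlace() otherwise, and to_variable() copies the data to that expected place):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(np.ones([2, 2], dtype='float32'))
    print(x._numpy())   # _numpy() now copies back to CPU via _copy_to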
...@@ -27,6 +27,7 @@ __all__ = [ ...@@ -27,6 +27,7 @@ __all__ = [
'Conv2D', 'Conv2D',
'Pool2D', 'Pool2D',
'FC', 'FC',
'BatchNorm',
] ]
...@@ -55,7 +56,8 @@ class Conv2D(layers.Layer): ...@@ -55,7 +56,8 @@ class Conv2D(layers.Layer):
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
dtype=dtype, dtype=dtype,
name=name) name=name,
act=act)
self._groups = groups self._groups = groups
self._stride = utils.convert_to_list(stride, 2, 'stride') self._stride = utils.convert_to_list(stride, 2, 'stride')
...@@ -141,6 +143,7 @@ class Conv2D(layers.Layer): ...@@ -141,6 +143,7 @@ class Conv2D(layers.Layer):
outputs={'Out': [pre_act]}, outputs={'Out': [pre_act]},
attrs={'axis': 1}) attrs={'axis': 1})
# Currently, we don't support inplace in imperative mode
return self._helper.append_activation(pre_act) return self._helper.append_activation(pre_act)
...@@ -216,6 +219,7 @@ class FC(layers.Layer): ...@@ -216,6 +219,7 @@ class FC(layers.Layer):
act=None, act=None,
name=None): name=None):
super(FC, self).__init__() super(FC, self).__init__()
self._size = size self._size = size
self._num_flatten_dims = num_flatten_dims self._num_flatten_dims = num_flatten_dims
self._dtype = dtype self._dtype = dtype
...@@ -241,6 +245,16 @@ class FC(layers.Layer): ...@@ -241,6 +245,16 @@ class FC(layers.Layer):
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)
if self._helper.bias_attr:
size = list([self._size])
self._b = self._helper.create_parameter(
attr=self._helper.bias_attr,
shape=size,
dtype=self._dtype,
is_bias=True)
else:
self._b = None
def forward(self, input): def forward(self, input):
tmp = self._helper.create_variable_for_type_inference(self._dtype) tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
...@@ -253,28 +267,155 @@ class FC(layers.Layer): ...@@ -253,28 +267,155 @@ class FC(layers.Layer):
"y_num_col_dims": 1 "y_num_col_dims": 1
}) })
out = self._helper.create_variable_for_type_inference(self._dtype) pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
type="sum", type="sum",
inputs={"X": [tmp]}, inputs={"X": [tmp]},
outputs={"Out": out}, outputs={"Out": pre_bias},
attrs={"use_mkldnn": False}) attrs={"use_mkldnn": False})
bias_attr = self._helper.bias_attr if self._b:
if bias_attr: pre_activation = self._helper.create_variable_for_type_inference(
# add bias dtype=self._dtype)
size = list(out.shape[1:])
if not self._built:
self._b = self._helper.create_parameter(
attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True)
bias_out = self._helper.create_variable_for_type_inference(
dtype=out.dtype)
self._helper.append_op( self._helper.append_op(
type='elementwise_add', type='elementwise_add',
inputs={'X': [out], inputs={'X': [pre_bias],
'Y': [self._b]}, 'Y': [self._b]},
outputs={'Out': [bias_out]}, outputs={'Out': [pre_activation]},
attrs={'axis': 1}) attrs={'axis': self._num_flatten_dims})
out = bias_out else:
# add activation pre_activation = pre_bias
return self._helper.append_activation(out) # Currently, we don't support inplace in imperative mode
return self._helper.append_activation(pre_activation)
class BatchNorm(layers.Layer):
def __init__(self,
num_channels,
act=None,
is_test=False,
momentum=0.9,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
dtype=core.VarDesc.VarType.FP32,
data_layout='NCHW',
in_place=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False,
fuse_with_relu=False,
use_global_stats=False):
super(BatchNorm, self).__init__()
assert bias_attr is not False, "bias_attr should not be False in batch_norm."
from ..layer_helper import LayerHelper
self._helper = LayerHelper(
'batch_norm',
param_attr=param_attr,
bias_attr=bias_attr,
name=name,
act=act)
if dtype == core.VarDesc.VarType.FP16:
self._dtype = core.VarDesc.VarType.FP32
else:
self._dtype = dtype
param_shape = [num_channels]
# create parameter
self._scale = self._helper.create_parameter(
attr=self._helper.param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0))
# TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
# # setting stop_gradient=True to reduce computation
# if use_global_stats and self._helper.param_attr.learning_rate == 0.:
# self._scale.stop_gradient = True
self._bias = self._helper.create_parameter(
attr=self._helper.bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True)
# TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
# # setting stop_gradient=True to reduce computation
# if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
# self._bias.stop_gradient = True
self._mean = self._helper.create_parameter(
attr=ParamAttr(
name=moving_mean_name,
initializer=Constant(0.0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=self._dtype)
self._mean.stop_gradient = True
self._variance = self._helper.create_parameter(
attr=ParamAttr(
name=moving_variance_name,
initializer=Constant(1.0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=self._dtype)
self._variance.stop_gradient = True
self._in_place = in_place
self._momentum = momentum
self._epsilon = epsilon
self._is_test = is_test
self._fuse_with_relu = fuse_with_relu
self._use_global_stats = use_global_stats
def _build_once(self, input):
pass
def forward(self, input):
# create output
# mean and mean_out share the same memory
mean_out = self._mean
# variance and variance out share the same memory
variance_out = self._variance
saved_mean = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
saved_variance = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type="batch_norm",
inputs={
"X": input,
"Scale": self._scale,
"Bias": self._bias,
"Mean": self._mean,
"Variance": self._variance
},
outputs={
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
},
attrs={
"momentum": self._momentum,
"epsilon": self._epsilon,
"is_test": self._is_test,
"use_mkldnn": False,
"fuse_with_relu": self._fuse_with_relu,
"use_global_stats": self._use_global_stats
})
# Currently, we don't support inplace in imperative mode
return self._helper.append_activation(batch_norm_out)
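A minimal usage sketch of the BatchNorm layer defined above, assuming this branch's imperative API; the input shape is illustrative:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.nn import BatchNorm
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    bn = BatchNorm(num_channels=4, act='relu')
    x = to_variable(np.random.rand(2, 4, 8, 8).astype('float32'))
    y = bn(x)  # normalizes over N, H, W separately for each of the 4 channels
    print(y._numpy().shape)  # expected (2, 4, 8, 8)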
@@ -435,7 +435,10 @@ class LayerHelper(object):
         act_type = act.pop('type')
         tmp = input_var
         # NOTE(dzhwinter): some activations support inplace computation.
-        if not core.IsInplace(act_type):
+        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
+        if not imperative_base.enabled() and core.IsInplace(act_type):
+            tmp = input_var
+        else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
...
@@ -523,7 +523,7 @@ def _py_reader(capacity,
     double_buffer_name = "_".join([name, "double_buffer"])
     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=reader_name)
...
@@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay):
         The decayed learning rate
     Examples:
         .. code-block:: python
             learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                 / (sqrt(sumsq(gradient)) + weight_decay * sqrt(sumsq(param)))
     """
...
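The LARS ratio quoted in the docstring above is easy to check numerically; a NumPy sketch, where the local_gw_ratio and weight_decay defaults are illustrative assumptions:

import numpy as np

def lars_local_lr(learning_rate, param, gradient,
                  local_gw_ratio=0.001, weight_decay=0.0005):
    # learning_rate *= local_gw_ratio * ||param|| /
    #                  (||gradient|| + weight_decay * ||param||)
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(gradient)))
    return learning_rate * local_gw_ratio * param_norm / (
        grad_norm + weight_decay * param_norm)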
@@ -179,6 +179,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'shuffle_channel',
     'py_func',
     'psroi_pool',
     'teacher_student_sigmoid_loss',
@@ -2874,7 +2875,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
     # setting stop_gradient=True to reduce computation
     if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        scale.stop_gradient = True
+        bias.stop_gradient = True
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -3875,6 +3876,7 @@ def beam_search(pre_ids,
                 beam_size,
                 end_id,
                 level=0,
+                is_accumulated=True,
                 name=None):
     """
     Beam search is a classical algorithm for selecting candidate words in a
@@ -3887,14 +3889,17 @@ def beam_search(pre_ids,
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
-    the output of beam_search at previous step, they are needed for special use
-    to handle ended candidate translations.
-    Note that the :attr:`scores` passed in should be accumulated scores, and
-    length penalty should be done with extra operators before calculating the
-    accumulated scores if needed, also suggest finding top-K before it and
-    using the top-K candidates following.
+    computation cell. If :attr:`ids` is not set, it will be derived from
+    :attr:`scores`. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
+    the output of beam_search at the previous step; they are needed to handle
+    ended candidate translations.
+    Note that if :attr:`is_accumulated` is True, the :attr:`scores` passed in
+    should be accumulated scores. Otherwise, :attr:`scores` are treated as
+    straightforward scores, and will be transformed to log space and
+    accumulated onto :attr:`pre_scores` inside this operator. Length penalty
+    should be applied with extra operators before calculating the accumulated
+    scores if needed.
     Please see the following demo for a full beam search usage example:
@@ -3924,6 +3929,8 @@ def beam_search(pre_ids,
             describes how these candidates belong to the prefix. The paths
             linking prefixes and selected candidates are organized and reserved
             in lod.
+        is_accumulated(bool, default True): Whether the input :attr:`scores`
+            are accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -3952,8 +3959,12 @@ def beam_search(pre_ids,
            end_id=end_id)
     """
     helper = LayerHelper('beam_search', **locals())
-    score_type = scores.dtype
-    id_type = ids.dtype
+    score_type = pre_scores.dtype
+    id_type = pre_ids.dtype
+    inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
+    if ids is not None:
+        inputs["ids"] = ids
     selected_scores = helper.create_variable_for_type_inference(
         dtype=score_type)
@@ -3961,12 +3972,7 @@ def beam_search(pre_ids,
     helper.append_op(
         type='beam_search',
-        inputs={
-            'pre_ids': pre_ids,
-            'pre_scores': pre_scores,
-            'ids': ids,
-            'scores': scores,
-        },
+        inputs=inputs,
         outputs={
             'selected_ids': selected_ids,
             'selected_scores': selected_scores,
@@ -3976,6 +3982,7 @@ def beam_search(pre_ids,
             'level': level,
             'beam_size': beam_size,
             'end_id': end_id,
+            'is_accumulated': is_accumulated,
         })
     return selected_ids, selected_scores
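With the hunk above applied, a single decoding step might be wired up as below; pre_ids, pre_scores and scores are assumed to be LoD tensors produced by earlier steps, and the hyper-parameters are illustrative:

import paddle.fluid as fluid

selected_ids, selected_scores = fluid.layers.beam_search(
    pre_ids=pre_ids,        # ids selected at the previous step
    pre_scores=pre_scores,  # accumulated scores at the previous step
    ids=None,               # now optional; derived from scores when omitted
    scores=scores,          # raw per-candidate scores of the current step
    beam_size=4,
    end_id=1,
    is_accumulated=False)   # op log-transforms scores and adds pre_scores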
@@ -5146,9 +5153,9 @@ def nce(input,
         littles = []
         for i in range(custom_dist_len):
             normal_prob = custom_dist[i] * custom_dist_len
-            if normal_prob - 1.0 > 1e-4:
+            if normal_prob - 1.0 > 0:
                 bigs.append((i, normal_prob))
-            elif 1.0 - normal_prob > 1e-4:
+            elif 1.0 - normal_prob > 0:
                 littles.append((i, normal_prob))
             else:
                 alias_probs_[i] = normal_prob
@@ -5164,9 +5171,9 @@ def nce(input,
             alias_probs_[little[0]] = little[1]
             alias_[little[0]] = big_idx
             big_left = big[1] + little[1] - 1
-            if big_left - 1.0 > 1e-4:
+            if big_left - 1.0 > 0:
                 bigs.append((big_idx, big_left))
-            elif 1.0 - big_left > 1e-4:
+            elif 1.0 - big_left > 0:
                 littles.append((big_idx, big_left))
             else:
                 alias_probs_[big_idx] = big_left
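The thresholds tightened above belong to what is essentially Vose's alias method, which nce uses to sample negatives from a custom distribution in O(1) per draw. An illustrative standalone construction (not Paddle's internal code):

import random

def build_alias_table(probs):
    # Scale so the average bucket mass is 1, then pair under-full buckets
    # ("small") with over-full ones ("large").
    n = len(probs)
    scaled = [p * n for p in probs]
    accept = [0.0] * n
    alias = [0] * n
    small = [i for i, p in enumerate(scaled) if p < 1.0]
    large = [i for i, p in enumerate(scaled) if p >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        accept[s] = scaled[s]
        alias[s] = l
        scaled[l] += scaled[s] - 1.0
        (small if scaled[l] < 1.0 else large).append(l)
    for i in small + large:
        accept[i] = 1.0
    return accept, alias

def alias_sample(accept, alias):
    # Pick a bucket uniformly, then keep it or jump to its alias.
    i = random.randrange(len(accept))
    return i if random.random() < accept[i] else alias[i]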
@@ -5856,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         type='increment',
         inputs={'X': [counter]},
         outputs={'Out': [counter]},
-        attrs={'step': float(step)})
+        attrs={'step': float(step)},
+        stop_gradient=True)
     counter.stop_gradient = True
     return counter
@@ -9475,7 +9483,7 @@ def teacher_student_sigmoid_loss(input,
                                  by the previous operator.
         label (Variable|list): the ground truth which is a 2-D tensor with
                                shape [N x 1], where N is the batch size.
         soft_max_up_bound (float): if input > soft_max_up_bound, will be bound
         soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound
     Returns:
@@ -9639,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
def shuffle_channel(x, group, name=None):
"""
**Shuffle Channel Operator**
This operator shuffles the channels of input x.
    It divides the input channels into :attr:`group` subgroups,
    and obtains a new channel order by picking one element from each
    subgroup in turn.
Please refer to the paper
https://arxiv.org/pdf/1707.01083.pdf
.. code-block:: text
Given a 4-D tensor input with the shape (N, C, H, W):
input.shape = (1, 4, 2, 2)
input.data =[[[[0.1, 0.2],
[0.2, 0.3]],
[[0.3, 0.4],
[0.4, 0.5]],
[[0.5, 0.6],
[0.6, 0.7]],
[[0.7, 0.8],
[0.8, 0.9]]]]
Given group: 2
        then we get a 4-D tensor out with the same shape as the input:
out.shape = (1, 4, 2, 2)
out.data = [[[[0.1, 0.2],
[0.2, 0.3]],
[[0.5, 0.6],
[0.6, 0.7]],
[[0.3, 0.4],
[0.4, 0.5]],
[[0.7, 0.8],
[0.8, 0.9]]]]
Args:
x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
        group(int): The number of subgroups; it must evenly divide the number of channels.
Returns:
        out(Variable): The channel-shuffled tensor, with the same shape and
            type as the input.
Raises:
        TypeError: If group is not an int.
Examples:
.. code-block:: python
input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
out = fluid.layers.shuffle_channel(x=input, group=2)
"""
helper = LayerHelper("shuffle_channel", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
if not isinstance(group, int):
raise TypeError("group must be int type")
helper.append_op(
type="shuffle_channel",
inputs={"X": x},
outputs={"Out": out},
attrs={"group": group})
return out
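The same transform in NumPy, mirroring the reference computation used by the op test later in this diff:

import numpy as np

def shuffle_channel_ref(x, group):
    # x: (N, C, H, W); split C into `group` subgroups, then interleave them.
    n, c, h, w = x.shape
    assert c % group == 0, "group must evenly divide the channel count"
    return x.reshape(n, group, c // group, h, w) \
            .transpose(0, 2, 1, 3, 4) \
            .reshape(n, c, h, w)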
class PyFuncRegistry(object):
    _register_funcs = []
...
@@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'dtype': out.dtype,
             'value': float(value),
             'force_cpu': force_cpu or force_init_on_cpu()
-        })
+        },
+        stop_gradient=True)
     out.stop_gradient = True
     return out
...
@@ -301,10 +301,10 @@ class Optimizer(object):
             no_grad_set (set|None): set of Variables should be ignored.
             callbacks (list|None): list of callables to run when appending backward
                 operator for one parameter.
         Return:
             list: list of (param, grad) pair, grad is the output of backward.
         Examples:
             See examples in `apply_gradients`.
         """
@@ -322,10 +322,10 @@ class Optimizer(object):
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.
         Returns:
             list: A list of operators appended to the current program.
         Examples:
             .. code-block:: python
@@ -364,7 +364,7 @@ class Optimizer(object):
         This method combines interface `backward()` and
         `apply_gradients()` into one.
         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
@@ -381,18 +381,21 @@ class Optimizer(object):
         optimize_ops = []
         if imperative_base.enabled():
             if parameter_list is not None:
-                params_grads = parameter_list
+                parameters = parameter_list
             else:
                 parameters = program.global_block().all_parameters()
-                params_grads = []
-                for param in parameters:
-                    # create gradient variable
-                    grad_var = Variable(
-                        block=loss.block,
-                        name=param._ivar._grad_name(),
-                        stop_gradient=True,
-                        ivar=param._ivar._grad_ivar())
-                    params_grads.append((param, grad_var))
+            params_grads = []
+            for param in parameters:
+                if param.stop_gradient:
+                    continue
+                # create gradient variable
+                grad_var = Variable(
+                    block=loss.block,
+                    name=param._ivar._grad_name(),
+                    stop_gradient=True,
+                    ivar=param._ivar._grad_ivar())
+                params_grads.append((param, grad_var))
             with program_guard(program, startup_program):
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
...
@@ -159,7 +159,7 @@ class ParallelExecutor(object):
             trainers_endpoints = main._trainers_endpoints
             if num_trainers > 1 and trainers_endpoints:
                 assert num_trainers == len(
-                    trainers_endpoints), "num_trainers == len(end_points)"
+                    trainers_endpoints), "num_trainers == len(endpoints)"
                 build_strategy.trainers_endpoints = trainers_endpoints
         # step6: get persistable_vars, places. persistable_vars
...
@@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
+list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
+py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
+    FLAGS_cudnn_deterministic=1)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
...
@@ -124,7 +124,7 @@ class TestDistRunnerBase(object):
         if args.batch_merge_repeat > 1:
             pass_builder = build_stra._finalize_strategy_and_create_passes()
             mypass = pass_builder.insert_pass(
-                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+                len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
             mypass.set("num_repeats", args.batch_merge_repeat)
         if args.update_method == "nccl2":
...
@@ -16,12 +16,17 @@ import os
 import unittest
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+# FIXME(zjl): It seems that this unittest fails randomly
+# when comparing all reduce last loss and reduce last loss
+# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta
+# Disable it temporarily.
+'''
 from test_parallel_executor_mnist import TestMNIST
 class EagerDeletionTestMNIST(TestMNIST):
     pass
+'''
 if __name__ == '__main__':
     unittest.main()
@@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer):
 class TestImperative(unittest.TestCase):
+    def test_sum_op(self):
+        x = np.ones([2, 2], np.float32)
+        with fluid.imperative.guard():
+            inputs = []
+            for _ in range(10):
+                inputs.append(fluid.imperative.base.to_variable(x))
+            ret = fluid.layers.sums(inputs)
+            loss = fluid.layers.reduce_sum(ret)
+            loss._backward()
+            self.assertTrue(np.allclose(ret._numpy(), x * 10))
+            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
     def test_layer(self):
         with fluid.imperative.guard():
             cl = core.Layer()
@@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase):
         x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
         param_grads = fluid.backward.append_backward(
             x, parameter_list=[x1.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         static_out, static_grad = exe.run(
             feed={inp.name: np_inp},
@@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase):
         x = l(inp)[0]
         param_grads = fluid.backward.append_backward(
             x, parameter_list=[l._x_for_debug.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         static_out, static_grad = exe.run(
             feed={inp.name: np_inp},
@@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase):
         out = mlp(inp)
         param_grads = fluid.backward.append_backward(
             out, parameter_list=[mlp._fc1._w.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
         static_out, static_grad = exe.run(
...
@@ -20,6 +20,7 @@ import sys
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
@@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer):
 class TestImperativeMnist(unittest.TestCase):
-    def test_mnist_cpu_float32(self):
+    def test_gan_float32(self):
         seed = 90
         startup = fluid.Program()
@@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase):
         sgd = SGDOptimizer(learning_rate=1e-3)
         sgd.minimize(g_loss)
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
+        ) else fluid.CUDAPlace(0))
         static_params = dict()
         with fluid.scope_guard(scope):
             img = np.ones([2, 1], np.float32)
...
@@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope
batch_size = 8
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": batch_size,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
},
"batch_size": batch_size,
"lr": 0.1,
"total_images": 1281164,
}
def optimizer_setting(params):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
total_images = 1281167
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(total_images / batch_size + 1)
bd = [step * e for e in ls["epochs"]]
base_lr = params["lr"]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to imperative mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
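For reference, the boundaries that the (currently commented-out) piecewise schedule above would produce with the train_parameters defined earlier in this file:

# step = int(1281164 / 8 + 1) = 160146 batches per epoch
step = int(1281164 / 8 + 1)
bd = [step * e for e in [30, 60, 90]]                 # [4804380, 9608760, 14413140]
lr = [0.1 * (0.1 ** i) for i in range(len(bd) + 1)]   # ~[0.1, 0.01, 0.001, 0.0001]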
class ConvBNLayer(fluid.imperative.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None)
self._batch_norm = BatchNorm(num_filters, act=act)
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class BottleneckBlock(fluid.imperative.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride)
self.shortcut = shortcut
self._num_channels_out = num_filters * 4
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = fluid.layers.elementwise_add(x=short, y=conv2)
layer_helper = LayerHelper('elementwise_add_activation', act='relu')
return layer_helper.append_activation(y)
class ResNet(fluid.imperative.Layer):
def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
num_channels = 64
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
bottleneck_block = BottleneckBlock(
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut)
num_channels = bottleneck_block._num_channels_out
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
y = self.pool2d_max(y)
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = self.out(y)
return y
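A one-batch smoke test of the ResNet defined above, as a hedged sketch with a random input in imperative mode:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    model = ResNet(layers=50, class_dim=102)
    img = to_variable(np.random.rand(1, 3, 224, 224).astype('float32'))
    probs = model(img)           # softmax output, shape (1, 102)
    print(probs._numpy().sum())  # should be ~1.0 for the single sample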
class TestImperativeResnet(unittest.TestCase):
def test_resnet_float32(self):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 1
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
            import random
            random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
dy_param_init_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
dy_param_init_value[param.name] = param._numpy()
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label._stop_gradient = True
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss._numpy()
if batch_id == 0:
for param in fluid.default_main_program().global_block(
).all_parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
dy_grad_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
if not param.stop_gradient:
np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor())
dy_grad_value[param.name + core.grad_var_suffix(
)] = np_array
optimizer.minimize(avg_loss)
dy_param_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
            import random
            random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
img = fluid.layers.data(
name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
optimizer.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
static_grad_name_list = []
for param in fluid.default_startup_program().global_block(
).all_parameters():
static_param_name_list.append(param.name)
for param in fluid.default_main_program().global_block(
).all_parameters():
if not param.stop_gradient:
static_grad_name_list.append(param.name +
core.grad_var_suffix())
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
static_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[batch_size, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
fetch_list.extend(static_grad_name_list)
out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_grad_value = {}
static_out = out[0]
param_start_pos = 1
grad_start_pos = len(static_param_name_list) + param_start_pos
for i in range(param_start_pos,
len(static_param_name_list) + param_start_pos):
static_param_value[static_param_name_list[
i - param_start_pos]] = out[i]
for i in range(grad_start_pos,
len(static_grad_name_list) + grad_start_pos):
static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
if __name__ == '__main__':
unittest.main()
@@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase):
         print(str(program))
+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
 if __name__ == '__main__':
     unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
import paddle.fluid.core as core
class TestShuffleChannelOp(OpTest):
def setUp(self):
self.op_type = "shuffle_channel"
self.batch_size = 10
self.input_channels = 16
self.layer_h = 4
self.layer_w = 4
self.group = 4
self.x = np.random.random(
(self.batch_size, self.input_channels, self.layer_h,
self.layer_w)).astype('float32')
self.inputs = {'X': self.x}
self.attrs = {'group': self.group}
n, c, h, w = self.x.shape
input_reshaped = np.reshape(self.x,
(-1, self.group, c // self.group, h, w))
input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types
 def wait_server_ready(endpoints):
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
         wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
...
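The probe that wait_server_ready performs per endpoint boils down to a plain TCP connect; an illustrative standalone version (the real retry loop lives in Paddle):

import socket
from contextlib import closing

def endpoint_ready(ep, timeout=3):
    # ep is "ip:port"; returns True when something is listening there.
    ip, port = ep.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        return sock.connect_ex((ip, int(port))) == 0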
@@ -477,13 +477,16 @@ class DistributeTranspiler(object):
                           trainer_id,
                           trainers,
                           current_endpoint,
-                          startup_program=None):
+                          startup_program=None,
+                          wait_port=True):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
+            if trainer_id == 0 and wait_port:
+                wait_server_ready(worker_endpoints)
             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -564,11 +567,13 @@ class DistributeTranspiler(object):
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
+            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program)
+                startup_program=startup_program,
+                wait_port=self.config.wait_port)
             return
         self.trainer_num = trainers
...
@@ -109,6 +109,7 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.reader',
           'paddle.fluid.contrib.slim',
           'paddle.fluid.contrib.slim.core',
           'paddle.fluid.contrib.slim.graph',
...