diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec632e20690eafdc558e24f160270a89b29ee41..e85fce58368aa233e39a554947e20a128fce6218 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,9 +276,3 @@ add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() - -if(WITH_DOC) - find_package(Sphinx REQUIRED) - find_python_module(recommonmark REQUIRED) - add_subdirectory(doc) -endif() diff --git a/Dockerfile b/Dockerfile index acfd091265e26d6c29c561d166fed2504c0cff1c..fe0721e9b99b5e028df2f6228ff04cb56a567a3f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG WITH_GPU ARG WITH_AVX -ARG WITH_DOC ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} ENV HOME /root # Add bash enhancements diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake deleted file mode 100644 index f74cd4ff8c9c2c52319b18ac37264167b3718eae..0000000000000000000000000000000000000000 --- a/cmake/FindSphinx.cmake +++ /dev/null @@ -1,147 +0,0 @@ -# - This module looks for Sphinx -# Find the Sphinx documentation generator -# -# This modules defines -# SPHINX_EXECUTABLE -# SPHINX_FOUND - -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - PATHS - /usr/bin - /usr/local/bin - /opt/local/bin - DOC "Sphinx documentation generator" -) - -if( NOT SPHINX_EXECUTABLE ) - set(_Python_VERSIONS - 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 - ) - - foreach( _version ${_Python_VERSIONS} ) - set( _sphinx_NAMES sphinx-build-${_version} ) - - find_program( SPHINX_EXECUTABLE - NAMES ${_sphinx_NAMES} - PATHS - /usr/bin - /usr/local/bin - /opt/loca/bin - DOC "Sphinx documentation generator" - ) - endforeach() -endif() - -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(Sphinx DEFAULT_MSG - SPHINX_EXECUTABLE -) - - -option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) -option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF ) -option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) -option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) -option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) -option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) -option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) -option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) -option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF ) - - -mark_as_advanced( - SPHINX_EXECUTABLE - SPHINX_HTML_OUTPUT - SPHINX_DIRHTML_OUTPUT - SPHINX_HTMLHELP_OUTPUT - SPHINX_QTHELP_OUTPUT - SPHINX_DEVHELP_OUTPUT - SPHINX_EPUB_OUTPUT - SPHINX_LATEX_OUTPUT - SPHINX_MAN_OUTPUT - SPHINX_TEXT_OUTPUT -) - -function( Sphinx_add_target target_name builder conf cache source destination ) - add_custom_target( ${target_name} ALL - COMMAND ${SPHINX_EXECUTABLE} -b ${builder} - -d ${cache} - -c ${conf} - ${source} - ${destination} - COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -sf ./index_*.html index.html - ) - - set_property( - DIRECTORY APPEND PROPERTY - ADDITIONAL_MAKE_CLEAN_FILES - ${destination} - ) -endfunction() - -# Target dependencies can be optionally listed at the end. -function( Sphinx_add_targets target_base_name conf source base_destination ) - - set( _dependencies ) - - foreach( arg IN LISTS ARGN ) - set( _dependencies ${_dependencies} ${arg} ) - endforeach() - - if( ${SPHINX_HTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html ) - - add_dependencies( ${target_base_name}_html ${_dependencies} ) - endif() - - if( ${SPHINX_DIRHTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml ) - - add_dependencies( ${target_base_name}_dirhtml ${_dependencies} ) - endif() - - if( ${SPHINX_QTHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp ) - - add_dependencies( ${target_base_name}_qthelp ${_dependencies} ) - endif() - - if( ${SPHINX_DEVHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp ) - - add_dependencies( ${target_base_name}_devhelp ${_dependencies} ) - endif() - - if( ${SPHINX_EPUB_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub ) - - add_dependencies( ${target_base_name}_epub ${_dependencies} ) - endif() - - if( ${SPHINX_LATEX_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex ) - - add_dependencies( ${target_base_name}_latex ${_dependencies} ) - endif() - - if( ${SPHINX_MAN_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man ) - - add_dependencies( ${target_base_name}_man ${_dependencies} ) - endif() - - if( ${SPHINX_TEXT_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text ) - - add_dependencies( ${target_base_name}_text ${_dependencies} ) - endif() - - if( ${BUILD_TESTING} ) - sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck ) - - add_dependencies( ${target_base_name}_linkcheck ${_dependencies} ) - endif() -endfunction() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1f4dbe0b49825aef9a236f7ae72c6bea168b2ec5..6679a09dfc9dd00cfe3b5c5da3e12bd1c1389432 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -708,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 4d040d219a7ebf96b3224952362aa84c30f946fa..afd3342768701adba4ff0040bd1c762b1cd8739d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -213,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) @@ -359,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160d074c13ca1dca36b4f2c5eeea4bb93..66f11dedbaccd7febcd75fa7ade9c68b6c42022c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,3 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(WITH_NGRAPH) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) -endif(WITH_NGRAPH) - cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - else() - if(WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(WITH_NGRAPH) + if (WITH_NGRAPH) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) + else () cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(WITH_NGRAPH) + endif() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -214,3 +206,24 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) + +# Get the current working branch +execute_process( + COMMAND git rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +message(STATUS "commit: ${PADDLE_COMMIT}") +message(STATUS "branch: ${PADDLE_BRANCH}") + +configure_file(commit.h.in commit.h) diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in new file mode 100644 index 0000000000000000000000000000000000000000..3a33ece624443a99083ae29abb70254a5ac40a3d --- /dev/null +++ b/paddle/fluid/framework/commit.h.in @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "@PADDLE_COMMIT@"; +} + +static std::string paddle_compile_branch() { + return "@PADDLE_BRANCH@"; +} + +static std::string paddle_version() { + return "@PADDLE_VERSION@"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 603df2e06936e3d9d8e7ec62efd0c6e83200239c..cd24a3175953bf323748bf0c7e3159761c13f0a9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312b3050debe745f2d3c108469c5d6..318694a1d4b0599655f05bf01c907fb6c07a4193 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,6 +25,9 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; + // If we set this to 1, we will delete all variables when finish a batch. and + // this will loss 15%+ performance. + // Please be aware about this parameters. size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7ceecce9193acfae0b4e03c06212edd6d6..4323883fa5cc9b26a68c2980f3b7a49eca610543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 2ee12cc410393d1e1aa5fc9e5374d858eca1b901..929d9edc34ffb92f468d5b7af54a0b8da4121543 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include #include namespace paddle { @@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector &source) { } std::unordered_set visited; - std::unordered_set to_visit{source.begin(), source.end()}; + std::set to_visit{source.begin(), source.end()}; std::vector inlink_visited; while (!to_visit.empty()) { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8fbbc6584e121d22bdec8173d501a35dc97c9c06..f46bdf96ba1e9e1e137c690057051d9a127d45c9 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { if (!platform::is_cpu_place(t.place())) { - LoDTensor tt; - framework::TensorCopy(t, platform::CPUPlace(), &tt); + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(t.place()); dev_ctx.Wait(); - os << tt; + os << cpu_tensor; return os; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf04dceecc164fae934ee15c4563af1..5d854cb8d7856a631faf01741d29d3cecfd9a627 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc deleted file mode 100644 index 7e174c7def1ffa4089a94d9cc504b18843557c53..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.cc +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44bea208b66acc3b3f4bc0f4adee4fb860..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/variant.h" - -#include "ngraph/type/element_type.hpp" - -namespace paddle { -namespace framework { - -class NgraphOperator : public OperatorBase { - public: - static std::vector< - std::vector>::iterator>> - NgraphOpIntervals( - std::vector>* ops); - - explicit NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); - - void RunImpl(const Scope& scope, const platform::Place& place) const final; - - private: - const ProgramDesc pdesc_; - size_t block_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - bool is_full_ = false; - - void Process(); -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38f811c0e9e8ecc6a4226e17e621a3c2ef3f78c5..ab3cf308fc04e227d5402712f6bab226fea04711 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -19,8 +19,6 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - int data_type = -1; + proto::VarType::Type dafault_data_type = + static_cast(-1); + proto::VarType::Type data_type = dafault_data_type; for (auto& input : this->inputs_) { const std::vector vars = ctx.MultiInputVar(input.first); for (size_t i = 0; i < vars.size(); ++i) { @@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); - int tmp = static_cast(t->type()); + proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( - tmp == data_type || data_type == -1, + tmp == data_type || data_type == dafault_data_type, "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", - Type(), data_type, tmp); + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + PADDLE_ENFORCE(data_type != dafault_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d..ef5404e4755817cefc925acbf4882ff86d1f0ba3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", + DataTypeToString(type_)); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a730b84a916ea2c3e17dd4becaf939cc28160457..5db422119966948f75970874e13d416ea699158a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b7df4b8886d629e98225c95eae9a4f2ed9400710..83fc6ee2e299f5fa18d5cc6f220c0be6a66e709d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/layer.h" + #include #include #include @@ -22,6 +23,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -34,22 +38,66 @@ std::map py_funcs_; using framework::Variable; -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label // ugly fix for it if (src_tensor->numel() == 0) { return; } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (int64_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { @@ -120,66 +168,104 @@ class Autograd { } }; +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + std::unique_ptr new_var(new VarBase()); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->Resize(var_->Get().dims()); + tensor->set_lod(var_->Get().lod()); + + if (blocking) { + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + } + + return new_var; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs.resize(1); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); + grad_op_desc->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } } - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad); - delete grad; + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(grad, orig_grad, place_); + delete grad; + } } } + return input_vars_; } @@ -188,8 +274,10 @@ void VarBase::RunBackward() { VLOG(3) << "start backward"; auto grads_t = grads_->var_->GetMutable(); - float* data = grads_t->mutable_data(platform::CPUPlace()); - std::fill(data, data + grads_t->numel(), 1.0); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + var_->GetMutable()->place())), + grads_t, 1.0); PADDLE_ENFORCE( grads_ == diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 0b1077c640e076797ba7e0200dc8d0eb8bfcff16..dc97433a5102b39d03ea5cac3157c027f9d67c98 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -21,17 +21,21 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { +class VarBase; + namespace py = ::pybind11; class PreparedOp { @@ -81,6 +85,8 @@ class PreparedOp { return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); } + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } + const framework::OperatorBase& op; const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; @@ -148,6 +154,9 @@ class VarBase { framework::LoDTensor& GradValue(); + std::unique_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; + inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, @@ -175,11 +184,13 @@ class OpBase { OpBase() : op_desc_(nullptr), forward_id_(-1), - grad_op_desc_(nullptr), - backward_id_(-1) {} + backward_id_(-1), + place_(platform::CPUPlace()) {} virtual ~OpBase() { - if (grad_op_desc_) delete grad_op_desc_; + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } } std::map> ApplyGrad(); @@ -188,18 +199,25 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; - // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. - framework::OpDesc* grad_op_desc_; + // Note: each fwd op corresponds to a vector of bwd ops. + std::vector grad_op_descs_; int backward_id_; + platform::Place place_; + VarBasePtrMap input_vars_; VarBasePtrMap output_vars_; OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - framework::VariableValueMap grad_input_vars_; - framework::VariableValueMap grad_output_vars_; + // Inputs to a vector of bwd ops. + std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. + std::vector grad_output_vars_; + framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 843fee41f38f1247473ba06978248659495f8585..cd62807a5532e6b2309cb5a8f679c3097b51c9e9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,33 +14,60 @@ #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace imperative { void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, + std::vector* grad_op_descs, std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = + PADDLE_ENFORCE(grad_op_descs->empty()); + std::vector> descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? - *grad_op_desc = grad_op_descs[0].release(); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } -void InitVar(framework::Variable* var, framework::Variable* grad_var) { +void InitVar(framework::Variable* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); auto& var_t = var->Get(); - float* data = - grad_var->GetMutable()->mutable_data( - var_t.dims(), platform::CPUPlace()); - std::fill(data, data + var_t.numel(), 0.0); + grad_var->GetMutable()->mutable_data( + var_t.dims(), dev_ctx->GetPlace()); + operators::math::set_constant( + *dev_ctx, grad_var->GetMutable(), 0.0); +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (auto it : inputs) { + for (VarBase* var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "Input variable should keep in the same place: %s, but get place: " + "%s of input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; } void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, const bool stop_gradient) { std::map vars; @@ -105,51 +132,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func(framework::ExecutionContext( + prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); - } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + + op->grad_input_vars_.resize(op->grad_op_descs_.size()); + op->grad_output_vars_.resize(op->grad_op_descs_.size()); + for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + // Douts. + grad_in_vars.push_back(var->grads_->var_); } - // Douts. - grad_in_vars.push_back(var->grads_->var_); } } - } - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[it.first]; - for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op_desc->Type()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + grad_out_vars.push_back(var->grads_->var_); } - grad_out_vars.push_back(var->grads_->var_); } } } @@ -178,10 +213,12 @@ std::vector Tracer::PyTrace(OpBase* op, out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } if (!stop_gradient) { + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); auto& grad_input_vars = - op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]; + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; auto& grad_output_vars = - op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)]; + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; for (const VarBase* inp : inputs) { grad_input_vars.push_back(inp->var_); @@ -189,16 +226,23 @@ std::vector Tracer::PyTrace(OpBase* op, for (VarBase* out : outputs) { grad_input_vars.push_back(out->var_); } + + platform::CPUPlace place; for (VarBase* out : outputs) { grad_input_vars.push_back(out->grads_->var_); if (!grad_input_vars.back()->IsInitialized()) { - InitVar(out->var_, grad_input_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(out->var_, grad_input_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } + for (const VarBase* inp : inputs) { grad_output_vars.push_back(inp->grads_->var_); if (!grad_output_vars.back()->IsInitialized()) { - InitVar(inp->var_, grad_output_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(inp->var_, grad_output_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f225d8abe6c0635d2bdd8dba0b12c7fc3a4110db..690838215581b09ff35a0ea13f30655b77e6e187 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace imperative { @@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, void InitVar(framework::Variable* var, framework::Variable* grad_var); +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} virtual ~Tracer() {} - void Trace(OpBase* op, - const std::map>& inputs, - const std::map>& outputs, - framework::BlockDesc* block, const bool stop_gradient = false); + void Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); private: + platform::Place GetPlace(const VarBasePtrMap& inputs); + framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 88ce61f9b928aba1945bddc1f9f6b785834780ca..a2546ead93c3baeb8029f6451d8a60dcc75f8571 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -130,10 +131,14 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + contrib::AnalysisConfig::Precision); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); // Indicate which kind of sort algorithm is used for operators, the memory // optimization relays on the sort algorithm. DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc57dbcc2ca16770a1b7d798de8b5625b..4f5c50d0d6b9ac94130cb82fb342ae5ee592f2c0 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index de04713b531dc421b885473cc8956e8ba6b63574..120f6ef27d49ae59ec36304dc3742cd9ca0afa4b 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -29,9 +30,14 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #ifdef _WIN32 +#include +#include #define GCC_ATTRIBUTE(attr__) ; +#define MKDIR(path) _mkdir(path) #else +#include #define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) @@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) { return false; } +static std::string GetDirRoot(const std::string &path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + return path; +} + +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + +// If there is no calib table data file in model_opt_cache_dir, return "". +static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4e1464226450b833e6d8dae2be2dcad89dd1e5e4..99611ce84b23896dd173831a03d77c6e0252d998 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8; + + pass->Set("enable_int8", new bool(enable_int8)); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); @@ -91,11 +105,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const { + std::unique_ptr *graph, ProgramDesc *program) const { auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + ProgramDesc desc; + desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); *graph = pass->Apply(std::unique_ptr(the_graph)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 983a582649706fa6eedb5aa459b5ac53b98f658b..2a595cb36b8345157b3fd26afc62aabfa98b87bc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -42,8 +43,8 @@ class IRPassManager final { std::unique_ptr Apply(std::unique_ptr graph); - framework::proto::ProgramDesc AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const; + framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph, + ProgramDesc *program) const; framework::ir::Graph &graph() const { return *graph_; } diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5f25303cc1eaa6b563f0f8f4289b38499eb487cc..69a9caec030600332c9f11ba255e4e642bd41e96 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -67,12 +68,33 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( return graph; } +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + // An fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); @@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, subgraph.size()); for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } - // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the eigine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - std::unordered_set output_names; - std::unordered_set output_names_with_id; + std::set output_names; + std::set output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // to Tensor. std::vector output_mapping; for (auto name : output_names) { - // LOG(INFO) << name << " " << output_name_map.size(); PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } @@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); - // Set attrs + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + + auto enable_int8 = Get("enable_int8"); + auto engine_key = + GenerateEngineKey(input_names_with_id, output_names_with_id); + + std::string calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "engine_key", engine_key); } std::vector ExtractParameters( diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 691c336ebe4b6a6cb60023859b21665b6a4756a8..9d74dc6c211e4fcb6d1e7de5369eee847f49fc78 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index f1da37af3cc5fa55eb66a1822aefe96eda1dc4fb..6b3d80fcef0be1527062edbb37ea39cc5d95a168 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { } std::unique_ptr graph(argument->main_graph_ptr()); - framework::ProgramDesc desc(argument->main_program()); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); auto thegraph = pass->Apply(std::move(graph)); thegraph.release(); // the argument still own the graph. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b727ef1c922e3a308db28d0af4f193602..3d1be9196fdeacd8ff852dbb595473a687352ccf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. + const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); - std::unordered_map tensor_nodes; - space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - std::unordered_map reuse_table; - double max_saving_ratio = 0.; + std::unordered_map reuse_table; + double max_saving_ratio = 0.; - std::vector> strategies; + std::vector> strategies; - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only make scene in static memory optimize. strategies.emplace_back([&, sort_kind] { auto clustered_vars_by_batch_size = AnalysisBatchShapesByBatchSize(batches); @@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { space_table, &reuse_table, sort_kind, &allocation); return allocation; }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024); // interval 1kb - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024 * 1024); // interval 1MB - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + std::function* best_strategy{nullptr}; - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. / 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6aeff60ec4468f41140c57be790af7f..216f416de0d1003b944337ee98fb4e6a22c66fc5 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed8306ef08144d096afa4f86133e492d..8efd514bd8397f099fd07321ad7e5d4ca253e229 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size) { +void contrib::AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + contrib::AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; Update(); #else @@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void contrib::AnalysisConfig::EnableMemoryOptim( + bool static_optim, bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } @@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa0e31cb7546d405e05e36754915f653..66374cb7f07b3d9b6bfbff8382a3dfa7e8f2b04f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,6 +39,8 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); @@ -44,6 +48,12 @@ DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; +using inference::Singleton; +#if PADDLE_WITH_TENSORRT +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; +#endif namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram( if (!program) { if (!LoadProgramDesc()) return false; + // If not cloned, the parameters should be loaded. + // If config_.ir_optim() is True, parameters is loaded in + // OptimizeInferenceProgram(), but other persistable variables + // (like RAW type var) are not created in scope. + // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), + // still need to create other persistable variables. + // So in both case, create persistable variables at first. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. @@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram( status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { - // If the parent_scope is passed, we assert that the persistable variables - // are already created, so just create the no persistable variables. - - // If not cloned, the parameters should be loaded - // OptimizeInferenceProgram. - // So in both cases, just the local variables are needed to load, not the - // parematers. - executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // Load parameters LOG(INFO) << "load parameters "; LoadParameters(); @@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs_.size()); - for (size_t i = 0; i < fetchs_.size(); ++i) { - int idx = boost::get(fetchs_[i]->GetAttr("col")); + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); - output->name = fetchs_[idx]->Input("X")[0]; + output->name = fetches_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); + argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -361,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); - argument_.SetScopeNotOwned(const_cast(scope_.get())); + argument_.SetScopeNotOwned(scope_.get()); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. + Singleton::Global().DeleteALL(); + return true; +} +#endif + AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); @@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } @@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } +std::string AnalysisPredictor::GetSeriazlizedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9095b6ec1af6794c19e94fc9326a48239b3ba145..fa1d0d596df5a3619af74e0fead3a0b376186e08 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); + std::string GetSeriazlizedProgram() const override; + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk + bool SaveTrtCalibToDisk(); +#endif + // Some more detailed tests, they are made the friends of the predictor, so that // the all the details can be tested. #if PADDLE_WITH_TESTING @@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 4688e93d7102109d2c7ece9ba37bc8f2d311dcf1..20b61344da978a87baf654efd4ad2b3ae90454c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); + ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6..6cd18277d63200f5bccf180a7ae3196b0ce126ff 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -97,4 +99,12 @@ void PaddleBuf::Free() { } } +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 7a579610eefda24c911edd28b5f3a178aa10ab1e..2c450ef7cead4d5c3870d5e9186eb221e5dc19a0 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { predictor->Run({}, &outputs); } +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee8904500636d7b49e6b4e54595dbce6a79954..5b899b26d60dec3634d7016c925143e1ae26992d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -135,7 +139,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -162,17 +167,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. */ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +190,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -238,10 +234,12 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; // memory reuse related. bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 46b510fd1ec94c59032b8f41a2ac4d6aa87dc150..406983224615fbdb649301f1ffe3fbd136938a61 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -215,6 +215,14 @@ class PaddlePredictor { */ virtual ~PaddlePredictor() = default; + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSeriazlizedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + }; + /** The common configs for all the predictors. */ struct Config { @@ -288,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config); int PaddleDtypeSize(PaddleDType dtype); +std::string get_version(); + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d209922ebe8d31723ca25c71a952ea08bd6..391932a1ee018c45818457c55fd8f82a22ab7405 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176c70bc03166ec7732ae5e2faf67ea54..f4977d08c4d051b8a528e122c47948c3c81d153c 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 78b590f15d639f7b21b403413760948c6343d998..10f48462cfaf8073a4f5537d654d614d36b74db4 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -69,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 65ab7f3caaa746cf339de67706939070a0b7d87d..cdfe09b5a7fd2d1f8548dab9421f671f5a345153 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -55,13 +57,16 @@ class TensorRTEngine : public EngineBase { }; TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, - int device = 0, + int device = 0, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream), - logger_(logger), - device_(device) {} + device_(device), + enable_int8_(enable_int8), + calibrator_(calibrator), + logger_(logger) {} virtual ~TensorRTEngine(); @@ -139,8 +144,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. std::unordered_map itensor_quote_num; @@ -153,9 +158,14 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; + cudaStream_t stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t stream_; nvinfer1::ILogger& logger_; @@ -165,8 +175,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a85c8b8fe6d70052edd3be59f98582c9b2e86b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. + while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string((const char*)ptr, length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr + << " length=" << length; +} +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 0000000000000000000000000000000000000000..919f5d55f88c3a6473f66371e2f3d91f3c4721c5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map> data_buffers_; + std::vector data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorEngine { + public: + TRTCalibratorEngine() {} + std::unique_ptr calib_; + std::unique_ptr thr_; + std::unique_ptr engine_; +}; +/* + * Manager to control the TensorRT Int8 calibration creation and deltetion. + */ +class TRTCalibratorEngineManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get Int8Calibrator via name + TRTCalibratorEngine* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. + TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 423c39813f05af0d6aaade184914e6777c9b8a83..07b9e0e051bce13f6caeca54a664019c55d80fa6 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -54,6 +54,7 @@ else() message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") endif() + # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") @@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) +# googlenet +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab42bcd9cc0608f033cb2777106a29583..e78ab942d113323fecf5510dca85fb5db734efc8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. if (FLAGS_max_turn_num == 9) { contrib::AnalysisConfig cfg, cfg1; @@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { SetInput(&input_slots_all); // Run the first time to force to update memory cache SetConfig(&cfg); - cfg.EnableMemoryOptim(true); + cfg.EnableMemoryOptim(true, true /*force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg), @@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { // Run second time to use the memory cache and perform memory optimization. SetConfig(&cfg1); - cfg1.EnableMemoryOptim(); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg1), @@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) { } } +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9705d75e70c8d9d9cbc433dd6358596..5d8684f083bda8499000c9fd0a7617cf129db13b 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" @@ -37,7 +38,7 @@ template void *Alloc(const Place &place, size_t size); template -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template size_t Used(const Place &place); @@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. @@ -98,7 +104,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -177,9 +184,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +202,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +259,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5ad639bf6176328e94da6eb71a41790c..e099425b94221bf1229e936fc1781615d13dbc26 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) @@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d91c5a81f39594b6dab7e5e717c9818f..e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. - items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidates ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated out according to Input(scores) in this operator.") + .AsDispensable(); AddInput("scores", - "(LoDTensor) The LodTensor containing the accumulated scores " - "corresponding to Input(ids) and its shape is the same as the " - "shape of Input(ids)."); + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). If Input(ids) is not nullptr, its " + "shape is the same as that of Input(ids)." + "If is_accumulated is true, Input(scores) is accumulated scores " + "and will be used derectedly. Else, each score will be " + "transformed to the log field and accumulate Input(pre_sores) " + "first."); AddOutput("selected_ids", "A LodTensor that stores the IDs selected by beam search."); AddOutput("selected_scores", @@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("beam_size", "beam size for beam search"); AddAttr("end_id", "the token id which indicates the end of a sequence"); + AddAttr("is_accumulated", + "Whether the Input(scores) is accumulated scores.") + .SetDefault(true); AddComment(R"DOC( This operator does the search in beams for one time step. @@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: void InferShape(framework::InferShapeContext *ctx) const override { for (const std::string &arg : - std::vector({"pre_ids", "ids", "scores"})) { + std::vector({"pre_ids", "scores"})) { PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'", arg); } @@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel { } } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = framework::OpKernelType( - ctx.Input("pre_ids")->type(), - platform::CPUPlace()); - return kt; + auto *scores = ctx.Input("scores"); + size_t level = ctx.Attr("level"); + size_t batch_size = scores->lod()[level].size() - 1; + // The current CUDA kernel only support cases with batch_size < 4. + // Compute on CPU for cases with batch_size > 4. + if (batch_size <= 4) { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), ctx.GetPlace()); + } else { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), + platform::CPUPlace()); + } } }; diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ef9476eee5d3fac4decd7273da824b2f2349199 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index b5e2ed05924cc8b7bc06058b9b1103ba10be486e..1b939e742de06aedf187d25d002d19e0a4fafc9d 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. - */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. - */ - bool NextItemSet(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - std::vector* items); - - private: - size_t beam_size_; - const framework::LoDTensor* ids_; - const framework::LoDTensor* scores_; - size_t lod_level_{0}; - size_t sent_offset_{0}; - int end_id_{0}; -}; - -std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); - -std::string ItemToString(const BeamSearch::Item& item); - template class BeamSearchOpKernel : public framework::OpKernel { public: @@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* scores = context.Input("scores"); auto* pre_ids = context.Input("pre_ids"); auto* pre_scores = context.Input("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_scores); @@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel { size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids, *scores, level, beam_size, end_id); + bool is_accumulated = context.Attr("is_accumulated"); + auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - alg(*pre_ids, *pre_scores, selected_ids, selected_scores); + + math::BeamSearchFunctor alg; + alg(context.template device_context(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, level, beam_size, end_id, + is_accumulated); } }; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc deleted file mode 100644 index 40b46781daa989fcd89887a3c01e97e39ea71255..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. -TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af82146e7c69c7c5aab8f08d0fe0d1710..f9570e4e2ed0d9ac8739410eb7cd7397ad09fae4 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index f97ebecfdd90beade3bef824c04ad7b2763eb036..d8b997cca613f660046106512fc03bf55f9b992d 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); @@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - auto search_func = [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); - + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; @@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. @@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // cudnnConvolutionForward and cudnnAddTensor // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); - + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); @@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv+bias+act forward -------------------- ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, - output_data)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 016cf8448c5e07fdedab8c5e4a7d0ae9e2ded1ee..f44094ca6b7b7f23f2e7593ad79e4e2a6f0d3070 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_grad->numel() / output_grad->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } @@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index fa8abf4ceca93c0e738cfe18921115f86e86bc34..03f47b594d9bc4a186aacd2eb457335f2a1bf752 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503783b32720eb57bd303ad7eb4bc17a8..b8e63f42e2040730ac79c57651d86d9e3176fa01 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5621f2c201b76690421a943db44535e..501a593b11d35c160348e42ee47216a85647aac4 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7b272c35b78818aa9e26feeb5497779..fea9b09414638b607ca7f7d558ce14a2d5bfa03d 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; @@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { [=] { _GetVariable(cntl_butil, request, response, done); }); } + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + void _GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) { @@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + std::string out_varname = request->out_varname(); VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << request->trainer_id() << ", from:" << cntl->remote_side(); auto scope = request_get_h_->scope(); - auto invar = scope->FindVar(varname); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; paddle::framework::Variable* outvar = nullptr; - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), - response, &cntl->response_attachment(), "", - false); + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); } } + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, @@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { private: distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; @@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support process one rpc use one threadpool. std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9a5dc5a86167ba9c9d4380832eeceb785a759037..e5318f98ca9844d7642eb688522380c4bcf347aa 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - delete scope; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } + + try { + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + delete scope; + } catch (std::exception& e) { + LOG(ERROR) << "async: run sub program error " << e.what(); + return false; + } return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a12ec5386041dfeea22fd388d94115e6..c3a46e348c69a20953f013c7de772a37db5f4844 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,27 +39,33 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } @@ -71,7 +77,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { @@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, @@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); }); - VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 47ff568a1135f2f0a146faa4d5d6fc422a344f51..7825b4fc82b1f7580fea8ab4961facaf7fd64397 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() - << ", Buffer Size = " << length; - PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast(length)); + << ", Buffer Size = " << length << ", dims:" << dims + << ", numel:" << tensor->numel(); + PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast(length)); return ReadRaw(input, ctx, tensor->place(), tensor_data, length); } diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 53968831ea0d640d13fc69ce1855257e8deed54c..5b30ed472d51a37a0705d1717395da9e4ff7d743 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. + VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c57590191d58a12760fb335df76037685d1ced..05c00251b97bb5071102a43208c1fbbfa4ef8d2d 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bb6934e1496cc989eee8ba82f56959522803bfb..cb8a4e7e1502e7e6ceb48e51452c2c7ab8313972 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -277,68 +277,6 @@ class TransformFunctor { Functor func_; }; -#define EIGEN_FUNCTOR(name, eigen_op) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_e); \ - } \ - template \ - inline void RunBroadCast(const framework::Tensor *x, \ - const framework::Tensor *y, framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ - .broadcast(Eigen::DSizes(pre, 1)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - template \ - inline void RunBroadCast2(const framework::Tensor *x, \ - const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n, int post) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ - .broadcast(Eigen::DSizes(pre, 1, post)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - } - -#define EIGEN_ADD(x, y) ((x) + (y)) - -EIGEN_FUNCTOR(Add, EIGEN_ADD); - -#define EIGEN_SUB(x, y) ((x) - (y)) - -EIGEN_FUNCTOR(Sub, EIGEN_SUB); - -#define EIGEN_MUL(x, y) ((x) * (y)) - -EIGEN_FUNCTOR(Mul, EIGEN_MUL); - -#define EIGEN_DIV(x, y) ((x) / (y)) - -EIGEN_FUNCTOR(Div, EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index c72a966c575d4a63471905b82643e96454f08187..6e13887866485bd114ebf12f4bdfa8d60fca6d01 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_datas.push_back( static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - for (int i = 0; i < 4; ++i) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], - static_cast(filters[i]->data()), conv_desc[i], - algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], - out_datas[i], bias_desc[i], - static_cast(bias[i]->data()), cudnn_act_desc, - out_desc[i], out_datas[i])); + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); } cudnnTensorDescriptor_t x_desc; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8f4a9f7685c84f1d9767f5f7eedf0e7..241184c6f4a19a1da0d6d75c5d4e2b372c14e9da 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], "Input(X) and Input(Grid) dims[0] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + grid_dims[1], x_dims[2], + "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); + PADDLE_ENFORCE_EQ( + grid_dims[2], x_dims[3], + "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + } ctx->SetOutputDim("Output", x_dims); ctx->ShareLoD("X", "Output"); diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 262094f9224407bb412f5b189a748efe13cb04b2..35775d7ec9efcdbad69e4491792f7d4e513832ad 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 74d6a87247821eb1d17cc97b8d8b4bcf1c832f79..186c37c56ec9410ac9a31503e33e7e334d0afc40 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -18,6 +18,7 @@ #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" @@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { LOG(INFO) << loginfos.str(); } +using Tensor = paddle::framework::Tensor; + template void BenchXYZNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d), z(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), - z.data(), d); + Tensor x, y, z; + x.Resize({d}); + y.Resize({d}); + z.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + T* z_data = z.mutable_data(PlaceType()); + RandomVec(d, x_data); + RandomVec(d, y_data); + BenchAllImpls, PlaceType>(d, x.data(), + y.data(), z_data, d); } } @@ -170,9 +179,13 @@ template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); } } @@ -180,9 +193,13 @@ void BenchAXYNKernel() { template void BenchXYNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, x.data(), y_data, d); } } @@ -192,16 +209,23 @@ void BenchLSTMKernel() { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); - RandomVec(4 * d, x.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); + Tensor x, ct_1, ct, ht, wp, checked; + x.Resize({4 * d}); + ct_1.Resize({d}); + ct.Resize({d}); + ht.Resize({d}); + wp.Resize({3 * d}); + checked.Resize({2 * d}); + auto place = PlaceType(); + RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); + RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); + RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.mutable_data(place); + T* checked_data = checked.mutable_data(place); + T* ct_data = ct.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; @@ -220,12 +244,16 @@ template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - std::vector x(3 * d), ht_1(d), ht(d); - RandomVec(3 * d, x.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); + auto place = PlaceType(); + Tensor x, ht_1, ht; + x.Resize({3 * d}); + ht_1.Resize({d}); + ht.Resize({d}); + RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); + RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; @@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; - std::vector x(h * w), y(w); - RandomVec(h * w, x.data(), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.data(); + Tensor x, y; + x.Resize({h * w}); + y.Resize({w}); + RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(attr, x_data, y_data, &attr); } @@ -259,12 +289,15 @@ void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); + Tensor a, b, c; + a.Resize({m * k}); + b.Resize({k * n}); + c.Resize({m * n}); + RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); + const T* a_data = a.data(); + const T* b_data = b.data(); + T* c_data = c.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(k, a_data, b_data, c_data, m, n, k); } diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 4e4f977fcc742856b877ef0b7f9a3cc9879aefce..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); const bool is_test = ctx.Attr("is_test"); @@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), + static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; @@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dc27e543f0dfd65e556f9e3a138778972ad6982f..6bbb7155dda9b2c844f793a63adb861c2ed956e8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,6 +54,7 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) +math_library(beam_search DEPS math_function) math_library(matrix_bit_code) @@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) +cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc new file mode 100644 index 0000000000000000000000000000000000000000..fb7119273a734feba870fdabade6a4faa1d5e9a3 --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CPUDeviceContext &context, + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, + const framework::LoDTensor *ids, + const framework::LoDTensor *scores, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + auto &high_level = abs_lod[level]; + + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level, + beam_size, end_id, is_accumulated); + auto selected_items = ToMap(items, high_level.back()); + if (FLAGS_v == 3) { + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset: " << i; + for (auto &item : selected_items[i]) { + VLOG(3) << item.ToString(); + } + } + } + + PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + auto *selected_ids_data = + selected_ids->mutable_data(platform::CPUPlace()); + auto *selected_scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + for (auto &item : items) { + selected_ids_data[low_offset] = item.id; + selected_scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + selected_scores->set_lod(lod); + } + + /* + * The basic items help to sort. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + size_t id; + // the corresponding score + float score; + + inline bool operator<(const Item &in) const { + return (score < in.score) || + ((score == in.score) && (offset < in.offset)); + } + + inline void operator=(const Item &in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + std::string ToString() { + std::ostringstream os; + os << "{"; + os << "offset: " << offset << ", "; + os << "id: " << id << ", "; + os << "score: " << score << ""; + os << "}"; + return os.str(); + } + }; + + protected: + /* + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. + */ + void PruneEndBeams(const framework::LoDTensor *pre_ids, + const framework::LoD &abs_lod, + std::vector> *items, size_t lod_level, + int end_id) { + auto *pre_ids_data = pre_ids->data(); + auto &high_level = abs_lod[lod_level]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id) || + pre_ids_data[offset] != end_id) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) + items->at(offset).clear(); + } + } + } + + /* + * Transform the items into a map whose key is offset, value is the items. + * NOTE low performance. + */ + std::vector> ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; + } + + void Insert(std::vector *top_beam_ptr, const Item &item, + size_t beam_size) { + std::vector &top_beam = *top_beam_ptr; + + size_t num_beams = top_beam.size(); + if (num_beams < beam_size) { + top_beam.resize(num_beams + 1); + num_beams++; + } else { + if (item < top_beam[beam_size - 1]) { + return; + } + } + + for (int k = static_cast(num_beams) - 2; k >= 0; --k) { + if (top_beam[k] < item) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = item; + return; + } + } + top_beam[0] = item; + } + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, + const framework::LoDTensor *scores, size_t lod_level, size_t beam_size, + int end_id, bool is_accumulated) { + std::vector> result; + + // find the current candidates + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + auto *pre_ids_data = pre_ids->data(); + auto *pre_scores_data = pre_scores->data(); + + auto *ids_data = ids ? ids->data() : nullptr; + auto *scores_data = scores->data(); + + size_t num_seqs = scores->NumElements(lod_level); + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) { + size_t seq_offset_start = abs_lod[lod_level][seq_id]; + size_t seq_offset_end = abs_lod[lod_level][seq_id + 1]; + + std::vector top_beam; + top_beam.reserve(beam_size); + + for (size_t offset = seq_offset_start; offset < seq_offset_end; + ++offset) { + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id) { + // Allocate all probability mass to end_id for finished branchs and + // the other candidate ids can be ignored. + Item item(offset, end_id, pre_score); + Insert(&top_beam, item, beam_size); + } else { + size_t index = offset * seq_width; + for (size_t d = 0; d < seq_width; d++, index++) { + int64_t id = ids_data ? ids_data[index] : static_cast(d); + float score = is_accumulated + ? scores_data[index] + : pre_score + std::log(scores_data[index]); + Item item(offset, id, score); + Insert(&top_beam, item, beam_size); + } + } + } + + result.emplace_back(top_beam); + } + + if (FLAGS_v == 3) { + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << item.ToString(); + } + } + } + + return result; + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu new file mode 100644 index 0000000000000000000000000000000000000000..d94e3023ce537cb9fa456e079c4fa3cf57fb954d --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cu @@ -0,0 +1,393 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { +namespace math { + +struct Triple { + __device__ __forceinline__ Triple() {} + __device__ __forceinline__ Triple(int o, int i, float s) + : offset(o), id(i), score(s) {} + + __device__ __forceinline__ void set(int o, int i, float s) { + offset = o; + id = i; + score = s; + } + + __device__ __forceinline__ void operator=(const Triple& in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + __device__ __forceinline__ bool operator<(const float s) const { + return score < s; + } + + __device__ __forceinline__ bool operator<(const Triple& in) const { + return (score < in.score) || ((score == in.score) && (offset < in.offset)); + } + + int offset; + int id; + float score; +}; + +__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p, + int beam_size) { + if (p < top_beam[beam_size - 1]) { + return; + } + for (int k = beam_size - 2; k >= 0; --k) { + if (top_beam[k] < p) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = p; + return; + } + } + top_beam[0] = p; +} + +template +__device__ __forceinline__ int SelectTopBeam( + Triple* top_beam, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + int used_threads) { + // top_beam is shared memory + const int tid = threadIdx.x; + const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq; + + int num_used_threads = used_threads; + + Triple* top_beam_local = top_beam + tid * beam_size; + if (tid_of_seq < num_used_threads) { + for (int i = 0; i < beam_size; ++i) { + top_beam_local[i].set(-1, -1, -INFINITY); + } + + for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) { + int pre_id = static_cast(pre_ids[offset]); + if (pre_id == end_id) { + if (tid_of_seq == 0) { + Triple tmp(offset, end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + Triple* top_beam_local, const int seq_offset_start, + const int seq_offset_end, const int selected_seq_start, + const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_offset_start, const int seq_offset_end, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each sequence. + Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, + seq_offset_start, seq_offset_end, selected_seq_start, + selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_length, const int seq_width, + int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. + auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000000000000000000000000000000000..3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * - LoD (should have 2 levels) + * - first level: [0, 1, 4] + * - second level: [0, 1, 2, 3, 4] + * - tensor's data: + * [[4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1]] + * + * scores: + * - LoD same as `ids` + * - tensor's data + * [[0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1]] + * + * The inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * Lets assume beam size is 2, and the beam search's output should be + * - LoD + * - first level: [0, 1, 2] + * - second level: [0, 2, 4] + * - id tensor's data + * [[4, + * 1, + * 3, + * 8]] + * - score tensor's data + * [[0.5, + * 0.3, + * 0.9, + * 0.7]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +template +class BeamSearchFunctor { + public: + /* + * The main function of beam search. + * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1], + * [0 1 2]] + * - [[] + * [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. + */ + void operator()(const DeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c29ee95f6b109209316e4e8c8f3cda37eac62ae --- /dev/null +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + TensorCopySync(cpu_pre_ids, *place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8f1d2cab19c74b57fdf9f903d9dc65d..238d9f2905058d267ffbee0669594920d7a9e031 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a504e3bd6b37c3cc772c179eab6038a4..3fa5a7ae336a9be984324411b88570aea99c2c78 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3f0ae9c7612c4f74470472393ff4ad6..aedb82da2f0fb2f15e1586d351af7c9d4364852b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e43f14445c969648cd469b0e32d644c7..74892316e6decdeab3a08396fa2f4bdeb8eb7b73 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e798912ff80eeb5d753914c7d8d70a05f..cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index fab46a5971d51960daf82ca6303daa5159e11f26..25b6ed851bc5149dcf6d25edc80544c99dd95d34 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b256ef02666c21ec1db3f6922b56bb23363b4a0 --- /dev/null +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_NGRAPH) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) + op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) +endif() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc similarity index 55% rename from paddle/fluid/framework/ngraph_bridge.cc rename to paddle/fluid/operators/ngraph/ngraph_bridge.cc index 365870c54eb3861ad6c273d3866dcd32d1c4166a..d6e897ed4666261cdd0bd6565f61abb218d971e5 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -17,39 +17,39 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { -namespace framework { +namespace operators { namespace NG_OPS = paddle::operators::ngraphs; std::map&, + std::function&, std::shared_ptr>>)>> NgraphBridge::NG_NODE_MAP = { {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, - {"mean", paddle::operators::ngraphs::BuildMeanNode}, - {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, - {"mul", paddle::operators::ngraphs::BuildMulNode}, - {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, - {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, - {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, - {"scale", paddle::operators::ngraphs::BuildScaleNode}, - {"relu", paddle::operators::ngraphs::BuildUnaryNode}, - {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, - {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; - -void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { + {"fill_constant", NG_OPS::BuildFillConstantNode}, + {"mean", NG_OPS::BuildMeanNode}, + {"mean_grad", NG_OPS::BuildMeanGradNode}, + {"mul", NG_OPS::BuildMulNode}, + {"mul_grad", NG_OPS::BuildMulGradNode}, + {"softmax", NG_OPS::BuildSoftmaxNode}, + {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, + {"scale", NG_OPS::BuildScaleNode}, + {"relu", NG_OPS::BuildUnaryNode}, + {"tanh", NG_OPS::BuildUnaryNode}, + {"top_k", NG_OPS::BuildTopKNode}}; + +void NgraphBridge::BuildNgNode( + const std::shared_ptr& op) { auto& op_type = op->Type(); NG_NODE_MAP[op_type](op, ngb_node_map_); } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h similarity index 84% rename from paddle/fluid/framework/ngraph_bridge.h rename to paddle/fluid/operators/ngraph/ngraph_bridge.h index 5ad7b8daeb6a782515e50fc87ca7188b46308390..c57988f8f6322e76678c572aa21ff5b17b9e3c22 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -21,16 +21,16 @@ limitations under the License. */ #include "ngraph/node.hpp" -namespace paddle { -namespace framework { +#include "paddle/fluid/framework/operator.h" -class OperatorBase; +namespace paddle { +namespace operators { class NgraphBridge { public: static std::map< std::string, - std::function&, + std::function&, std::shared_ptr>>)>> NG_NODE_MAP; @@ -41,7 +41,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgNode(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< @@ -49,5 +49,5 @@ class NgraphBridge { ngb_node_map_; }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..bec4b514a218715134d2366dd7efd7cf5b377b68 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,491 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (NgraphBridge::NG_NODE_MAP.find(op_type) == + NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != + NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphEngine::BuildNgFunction() { + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +void NgraphEngine::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string input_shape_str; + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + for (size_t i = 0; i < shape.size(); ++i) { + input_shape_str += std::to_string(shape.at(i)); + } + } + func_cache_key_ = input_shape_str + func_cache_key_; + if (func_cache_.find(func_cache_key_) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(func_cache_key_); + } else { + BuildNgFunction(); + func_cache_[func_cache_key_] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphEngine::Run(const framework::Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + auto ng_type = var_type_map_.at(vi); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || + ng_op_state_ == OpState::FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto vo = var_out_[i]; + auto* var = scope.FindVar(vo); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(vo); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", vo); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vo); + } + } + + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); +} // NgraphEngine::Run +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..bf5ff2a743b0edb69163e674d36c56a02c0b4153 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace operators { + +enum class OpState { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST, /* Support partial list of ops for test */ + FULL, /* All ops supported from feed to fetch */ + UNKNOWN /* Output all for debug purpose */ +}; + +// perform graph build through bridge and execute computation +class NgraphEngine { + public: + explicit NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval); + + void Run(const framework::Scope& scope, const platform::Place& place) const; + + static void EnableNgraph(const framework::ProgramDesc& program); + + private: + static std::unordered_map> + func_cache_; + const framework::Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + OpState ng_op_state_ = OpState::UNKNOWN; + std::string func_cache_key_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // prepare info for nraph engine + void Prepare(const framework::BlockDesc& block, + const std::vector& interval); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3051ca123b29658d3e9a35239ad00f621a297cb5 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDispensable(); + AddOutput("Ys", "A list of outputs").AsDispensable(); + AddAttr("graph", "the graph."); + AddAttr>("interval", "op interval supported by ngraph"); + AddComment("ngraph engine operator."); + } +}; + +class NgraphEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker, + ops::NgraphEngineOpMaker); +REGISTER_OP_CPU_KERNEL( + ngraph_engine, + ops::NgraphEngineKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d2974298b0707575624ad2f6935e83d06b4c83bb --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 58a465d87a8c0da50e3eb80fefe32d50217f6990..2a3e80c9152b5550631f8c5669283b782f975d4e 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - int thread_num = Attr("thread_num"); - std::vector slots = Attr>("slots"); - int batch_size = Attr("batch_size"); - std::vector file_list = - Attr>("file_list"); - out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, - thread_num, slots, file_list)); + auto thread_num = Attr("thread_num"); + auto sparse_slots = Attr>("sparse_slots"); + auto dense_slot_index = Attr>("dense_slot_index"); + auto sparse_slot_index = Attr>("sparse_slot_index"); + auto batch_size = Attr("batch_size"); + auto file_type = Attr("file_type"); + auto file_format = Attr("file_format"); + auto file_list = Attr>("file_list"); + DataDesc data_desc(batch_size, file_list, file_type, file_format, + dense_slot_index, sparse_slot_index, sparse_slots); + VLOG(1) << data_desc; + out->Reset(std::make_shared(queue_holder->GetQueue(), thread_num, + data_desc)); } }; @@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddAttr("thread_num", "the thread num to read data"); AddAttr("batch_size", "the batch size of read data"); + AddAttr("file_type", "plain or gzip").SetDefault("plain"); + AddAttr("file_format", "svm or csv").SetDefault("csv"); AddAttr>("file_list", "The list of files that need to read"); - AddAttr>( - "slots", "the slots that should be extract from file"); + AddAttr>( + "dense_slot_index", + "the dense slots id that should be extract from file") + .SetDefault({}); + AddAttr>( + "sparse_slot_index", + "the sparse slots id that should be extract from file") + .SetDefault({}); + AddAttr>("sparse_slots", + "the sparse slots id that should be " + "extract from file, used when file " + "format is svm"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp. diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89dc09a185e6a41274cf382b430ec3eeb..f08798794a2f9fc042800583cbc032d6f12bf3dc 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,6 +73,9 @@ static inline void parse_line( } } +// label slot1:fea_sign slot2:fea_sign slot1:fea_sign +static inline void parse_svm_line(const std::string& line) {} + class Reader { public: virtual ~Reader() {} @@ -95,11 +98,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : stream_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return stream_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(stream_, *line); } + + private: + std::ifstream stream_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); } } @@ -119,46 +138,35 @@ class MultiGzipReader : public Reader { } private: - std::vector> readers_; + std::vector> readers_; size_t current_reader_index_ = 0; }; void MonitorThread(std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "monitor thread in"; + VLOG(3) << "monitor thread in"; bool reader_thread_is_running = true; while (reader_thread_is_running) { - VLOG(30) << "reader_thread_is_running"; + VLOG(3) << "reader_thread_is_running"; reader_thread_is_running = false; for (size_t i = 0; i < (*thread_status).size(); ++i) { if ((*thread_status)[i] == Running) { - VLOG(30) << "reader is running!"; + VLOG(3) << "reader is running!"; reader_thread_is_running = true; } } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(30) << "all reader thread is stopped, push empty data into queue"; - queue->Push({}); - VLOG(30) << "monitor thread exited"; + VLOG(3) << "all reader thread is stopped, close the queue"; + queue->Close(); + VLOG(3) << "monitor thread exited"; } -void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, - std::shared_ptr queue) { - VLOG(30) << "[" << thread_id << "]" - << " reader thread start! thread_id = " << thread_id; - for (auto& file : file_list) { - VLOG(30) << "[" << thread_id << "]" - << " file " << file; - } - (*thread_status)[thread_id] = Running; - VLOG(30) << "set status to running"; - +void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { std::unordered_map slot_to_index; - for (size_t i = 0; i < slots.size(); ++i) { - slot_to_index[slots[i]] = i; + for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { + slot_to_index[data_desc.sparse_slot_ids_[i]] = i; } std::string line; @@ -166,21 +174,17 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiGzipReader reader(file_list); - - VLOG(30) << "reader inited"; - - while (reader.HasNext()) { + while (reader->HasNext()) { batch_data.clear(); - batch_data.reserve(batch_size); + batch_data.reserve(data_desc.batch_size_); batch_label.clear(); - batch_label.reserve(batch_size); + batch_label.reserve(data_desc.batch_size_); // read batch_size data - for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); std::unordered_map> slot_to_data; int64_t label; parse_line(line, slot_to_index, &label, &slot_to_data); @@ -193,8 +197,8 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; - // first insert tensor for each slots - for (auto& slot : slots) { + // first insert tensor for each sparse_slots + for (auto& slot : data_desc.sparse_slot_ids_) { std::vector lod_data{0}; std::vector batch_feasign; @@ -226,11 +230,167 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(label_tensor); queue->Push(lod_datas); - VLOG(40) << "push one data, queue_size=" << queue->Size(); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + auto id = std::stol(data_str); + (*sparse_datas)[i].push_back(id); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +void ReadThread(const std::vector& file_list, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 56879ffda5d3e04a88d12d6c4701c24a0d0ee4f7..740cd5219c70331d1f71d832adef084c148a2408 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,9 +36,63 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; +struct DataDesc { + DataDesc(int batch_size, const std::vector& file_names, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slot_index, + const std::vector& sparse_slot_index, + const std::vector& sparse_slot_ids) + : batch_size_(batch_size), + file_names_(file_names), + file_type_(file_type), + file_format_(file_format), + dense_slot_index_(dense_slot_index), + sparse_slot_index_(sparse_slot_index), + sparse_slot_ids_(sparse_slot_ids) {} + + const int batch_size_; + const std::vector file_names_; + const std::string file_type_; // gzip or plain + const std::string file_format_; // csv or svm + // used for csv data format + const std::vector dense_slot_index_; + const std::vector sparse_slot_index_; + // used for svm data format + const std::vector sparse_slot_ids_; +}; + +inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) { + os << "data_desc:\n"; + os << "\tbatch_size -> " << data_desc.batch_size_ << "\n"; + os << "\tfile_type -> " << data_desc.file_type_ << "\n"; + os << "\tfile_format -> " << data_desc.file_format_ << "\n"; + os << "\tfile_names -> {"; + for (auto& file_name : data_desc.file_names_) { + os << file_name << ","; + } + os << "}\n"; + os << "\tdense_slot_index -> {"; + for (auto& slot : data_desc.dense_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_index_ -> {"; + for (auto& slot : data_desc.sparse_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_ids_ -> {"; + for (auto& slot : data_desc.sparse_slot_ids_) { + os << slot << ","; + } + os << "}\n"; + + return os; +} + void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, std::shared_ptr queue); // monitor all running thread, if they are all stopped, @@ -48,15 +102,15 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, size_t thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int thread_num, const DataDesc& data_desc) + : data_desc_(data_desc) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = std::min(file_list_.size(), thread_num); + PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0, + "file list should not be empty"); + + thread_num_ = std::min(data_desc_.file_names_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() {} + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader { for (auto& read_thread : read_threads_) { read_thread->join(); } - monitor_thread_->join(); + + if (monitor_thread_) { + monitor_thread_->join(); + } read_threads_.clear(); monitor_thread_.reset(nullptr); @@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( - &ReadThread, file_groups_[thread_id], slots_, batch_size_, + &ReadThread, file_groups_[thread_id], data_desc_, static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader { private: void SplitFiles() { file_groups_.resize(thread_num_); - for (size_t i = 0; i < file_list_.size(); ++i) { - auto& file_name = file_list_[i]; + for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) { + auto& file_name = data_desc_.file_names_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); file_groups_[i % thread_num_].push_back(file_name); @@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; - const int batch_size_; - const std::vector slots_; - const std::vector file_list_; + const DataDesc data_desc_; std::shared_ptr queue_; std::vector> read_threads_; std::unique_ptr monitor_thread_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 8dba9baebce0a82ee2a541fe6ae9f6bcef8e2835..9f3a254c84d4e04fbcd449644a7e138eff520fbc 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -36,6 +36,7 @@ using paddle::framework::LoD; using paddle::framework::DDim; using paddle::platform::CPUPlace; using paddle::framework::make_ddim; +using paddle::operators::reader::DataDesc; static void generatedata(const std::vector& data, const std::string& file_name) { @@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, false); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, thread_num, slots, file_list); + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, + sparse_slots); + + CTRReader reader(queue, thread_num, data_desc); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); reader.Start(); - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); } + +static void GenereteCsvData(const std::string& file_name, + const std::vector& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data()[i * dense_dim[1] + j], + static_cast(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenereteCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 3f041ff7e4e32b407729a22aab25d3aab199fee0..5b53edff5d8ea79a03542231dbf34f5a6f254986 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -32,10 +32,8 @@ class LoDTensorBlockingQueue { friend class LoDTensorBlockingQueueHolder; private: - LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims, - bool speed_test_mode = false) - : queue_(capacity, speed_test_mode), dims_(dims) {} + explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false) + : queue_(capacity, speed_test_mode) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -65,17 +63,15 @@ class LoDTensorBlockingQueue { private: BlockingQueue> queue_; - std::vector dims_; }; class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims, - bool speed_test_mode = false) { + void InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); + queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index a0b70938d354cbb3bf10a9c8c589ba5153624f45..8fe638ac2fdc6e0baed7d6cd3c57b72f23164129 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase { "The ReadOp must take a reader as input."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "The ReadOp should be assigned with output."); - std::vector reader_dims = ctx->GetReaderDims("Reader"); - std::vector out_names = ctx->Outputs("Out"); - PADDLE_ENFORCE_EQ( - reader_dims.size(), out_names.size(), - "The reader's dim number doesn't match the output number."); - ctx->SetOutputsDim("Out", reader_dims); - if (!ctx->IsRuntime()) { + if (!ctx->IsRuntime() && ctx->Attrs().Get("infer_out")) { + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); auto in_lod_levels = in_desc->GetLoDLevels(); @@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + if (infer_out) { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } } } }; @@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + VLOG(3) << "read op in"; framework::ReaderHolder* reader = detail::Ref(scope.FindVar(Input("Reader")), "Cannot find reader variable %s", Input("Reader")) @@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { + VLOG(3) << "read empty data in"; if (Attr("throw_eof_exp")) { + VLOG(3) << "throw_eof_exp"; PADDLE_THROW_EOF(); } else { ins.resize(out_arg_names.size()); @@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase { tensor.mutable_data(framework::make_ddim({0}), dev_place); } } + VLOG(3) << "read empty data out"; } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { @@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { " only when the data-balance is enabled in ParallelExecutor" " and it is set by ParallelExecutor instance, not users.") .SetDefault(true); + AddAttr("infer_out", "").SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index b82aab1214992be73d876a42424234e3cea46455..3921eedf94abbe68bed035940913f830a6c16e48 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() { "It means the reader will generate two data each time," "whose shapes are [2,3,4] and [5,6] respectively."); AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr( + "use_data_config", + "Use the config of all datas like shape_concat/ranks/lod_levels") + .SetDefault(true); Apply(); } @@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); - const auto shape_concat = ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); + bool use_data_config = ctx->Attrs().Get("use_data_config"); + if (use_data_config) { + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9349912e090f2ad3248923c87b50c8d72b0d84d1 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,113 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This opearator shuffles the channels of input x. + It divide the input channels in each group into several subgroups, + and obtain a new order by selecting element from every subgroup one by one. + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers. + please get more information from the following paper: + https://arxiv.org/pdf/1707.01083.pdf + )DOC"); + } +}; + +class ShuffleChannelGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, + ops::ShuffleChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9506343b3d508459c6e10dc68eba13504b07338f --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ShuffleChannel(const int nthreads, const int feature_map_size, + T* output, const T* input, int group_row, + int group_column, int len) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t ii = index; ii < nthreads; ii += offset) { + const int n = index / group_row / group_column / len; + const int i = (index / group_column / len) % group_row; + const int j = index / len % group_column; + const int k = index - (n * feature_map_size + (i * group_column + j) * len); + T* p_o = output + n * feature_map_size + (j * group_row + i) * len; + p_o[k] = input[index]; + } +} +template +class ShuffleChannelOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + // count is the product of NCHW same as numel() + int count = num * group_column * group_row * sp_sz; + + int blocks = NumBlocks(output->numel()); + int threads = kNumCUDAThreads; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + ShuffleChannel< + T><<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); + } +}; + +template +class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + int blocks = NumBlocks(output_grad->numel()); + int threads = kNumCUDAThreads; + int count = num * group_column * group_row * sp_sz; + + ShuffleChannel< + T><<>>( + count, feature_map_size, input_grad_data, output_grad_data, group_row, + group_column, sp_sz); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6af1bc88598870ebccef81bd37f93f376940851 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ShuffleChannelOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = input_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = + output_data + n * feature_map_size + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +template +class ShuffleChannelGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = output_grad_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = input_grad_data + n * feature_map_size + + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad13e892efd51648b78704bec83bf2b4..031335009b692f9d1f73070c88e8e79d852cbe36 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr("sub_block", "the trt block"); + AddAttr("enable_int8", "whether swith to int8 mode"); AddComment("TensorRT engine operator."); } }; @@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e7e990f759ba411f6954c51fb697a6befbad31b1..2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -17,8 +17,10 @@ #ifdef PADDLE_WITH_CUDA #include +#include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr calibrator_; + bool enable_int8_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + enable_int8_ = Attr("enable_int8"); + calibration_data_ = Attr("calibration_data"); + engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); for (const auto ¶m : params) { param_names_.insert(param); } + // calibration_mode is ture represents we need to + // generate the calibration table data. + calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); + + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr("sub_block"); + auto *program = block->Program(); + auto ¤t_scope = scope.NewScope(); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // This process will builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram for each + // tensor of the distribution of activation values. + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... "; + int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton::Global().Create(engine_key_); + std::unordered_map calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, enable_int8_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNativeImpl(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; @@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase { void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 391e7a1c070e040f6e90f820634c0d8b7cd40a96..5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. engine_op->Run(scope, place); diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 5e16a209e712a143e1083e171f88002817aef838..a764d59410c90535dbda0b3f11e89ae9bf578c04 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - math::SetConstant()( - ctx.template device_context(), loss, static_cast(0)); - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size); - void* cudnn_workspace = temp_allocation->ptr(); - - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, - cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, - cu_ctcloss_desc, cudnn_workspace, workspace_size)); + + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( + handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + loss_data, cu_grad_desc, warpctc_grad_data, + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); } }; diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 9f504d14a8da116648483c0f64cb511b46e6a97e..2ce8f141d3c51661305f4952479cf2889fc4f396 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 @@ -30,6 +31,34 @@ namespace platform { mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +inline static int RoundToPowerOfTwo(int dim) { + if (dim > 512) { + return 1024; + } else if (dim > 256) { + return 512; + } else if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = 32) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d7822f1dc16cee2514a991b7341f5d1cfd..2493fb71c019f9923012afa4a46cb3e95479f860 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); + "Place %s is not supported, Please re-compile with WITH_GPU " + "option", + place); } return it->second.get().get(); } diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca89d91aadb2d3e9005e6dd06cef124428d7e250..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", namespace paddle { namespace platform { -int GetCUDADeviceCount() { +static int GetCUDADeviceCountImpl() { + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected."; + return 0; + } + } + int count; PADDLE_ENFORCE( cudaGetDeviceCount(&count), @@ -66,6 +79,11 @@ int GetCUDADeviceCount() { return count; } +int GetCUDADeviceCount() { + static auto dev_cnt = GetCUDADeviceCountImpl(); + return dev_cnt; +} + int GetCUDAComputeCapability(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); cudaDeviceProp device_prop; @@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); + "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " + "%p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dbc7843caa0c0a39a32cda6050fa99a3ab4c3e22..31c3bfa43ffec22059a602e9ff09a33188d72c91 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -15,18 +15,38 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module *m) { +void BindTracer(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CPUPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 26247026667158a2f43cdac21bf5600479455e16..e05667d2c7e9ce5c64cfacee4919cd36d7383c0c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { } void BindAnalysisConfig(py::module *m) { - py::class_(*m, "AnalysisConfig") - .def(py::init()) + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) .def(py::init()) .def(py::init()) .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & @@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, - py::arg("min_subgraph_size") = 3) + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c470483756659b55329c022e0c43002182db815b..97e5bbaaccaf7c702a324abd708a314c72ece004 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) .def_property( @@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = @@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "_Scope", R"DOC( @@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bb7258ee5913469d9f9a5f1bf5cf4bb4fa63938a..1135caf4f8c32901d93270d372fdaac702acf006 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -173,7 +173,6 @@ function cmake_gen() { -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} -DWITH_DSO=ON - -DWITH_DOC=${WITH_DOC:-OFF} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${distibuted_flag} @@ -208,7 +207,6 @@ EOF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ -DWITH_DSO=ON \ - -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${distibuted_flag} \ @@ -328,7 +326,8 @@ function run_brpc_test() { ======================================== EOF set +x - declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test") + declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \ + "rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test") all_tests=`ctest -N` for t in "${other_tests[@]}" @@ -527,31 +526,6 @@ function bind_test() { wait } - -function gen_docs() { - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - cat < 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5171,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left @@ -5856,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + stop_gradient=True) counter.stop_gradient = True return counter @@ -9475,7 +9483,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: @@ -9639,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None): return out +def shuffle_channel(x, group, name=None): + """ + **Shuffle Channel Operator** + + This operator shuffles the channels of input x. + It divide the input channels in each group into :attr:`group` subgroups, + and obtain a new order by selecting element from every subgroup one by one. + + Please refer to the paper + https://arxiv.org/pdf/1707.01083.pdf + + .. code-block:: text + + Given a 4-D tensor input with the shape (N, C, H, W): + input.shape = (1, 4, 2, 2) + input.data =[[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + Given group: 2 + then we get a 4-D tensor out whth the same shape of input: + out.shape = (1, 4, 2, 2) + out.data = [[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + + Args: + x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W] + group(int): Indicating the conuts of subgroups, It should divide the number of channels. + + Returns: + out(Variable): the channels shuffling result is a tensor variable with the + same shape and same type as the input. + + Raises: + ValueError: If group is not an int type variable. + + Examples: + .. code-block:: python + + input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32') + out = fluid.layers.shuffle_channel(x=input, group=2) + """ + helper = LayerHelper("shuffle_channel", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(group, int): + raise TypeError("group must be int type") + + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}) + return out + + class PyFuncRegistry(object): _register_funcs = [] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ce9f508c9f10981d62b7f8417080f0f3b8d9b8a7..2153ca254f0e286a77160a2d53473e1bc76109d5 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'dtype': out.dtype, 'value': float(value), 'force_cpu': force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) out.stop_gradient = True return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index b72b900d3b26cb705b76623e9bacc16c76a7c79f..14f4276e2f4fc4a24d701ef05c94b88c4f0336da 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -301,10 +301,10 @@ class Optimizer(object): no_grad_set (set|None): set of Variables should be ignored. callbacks (list|None): list of callables to run when appending backward operator for one parameter. - + Return: list: list of (param, grad) pair, grad is the output of backward. - + Examples: See examples in `apply_gradients`. """ @@ -322,10 +322,10 @@ class Optimizer(object): Args: params_grads (list): list of (param, grad) pair to do optimization. - + Returns: list: A list of operators appended to the current program. - + Examples: .. code-block:: python @@ -364,7 +364,7 @@ class Optimizer(object): This method combines interface `backward()` and `apply_gradients()` into one. - + Args: loss (Variable): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters @@ -381,18 +381,21 @@ class Optimizer(object): optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: - params_grads = parameter_list + parameters = parameter_list else: parameters = program.global_block().all_parameters() - params_grads = [] - for param in parameters: - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + + params_grads = [] + for param in parameters: + if param.stop_gradient: + continue + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(program, startup_program): optimize_ops = self._create_optimization_pass(params_grads) else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584c399b790580757dea746d7b4e4ac80..a07ff6ac69ca20c8c68659a67606076ce8cdf027 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. persistable_vars diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 808e1e6aa80744db1289094d7c1bad00002a4c3e..c23dfa01e76c21d0d162f2fed986e2eaf3a70a6d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) +list(REMOVE_ITEM TEST_OPS test_imperative_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index e51ae1a944e70ba71cdced9b0126ea2e46a364b4..0968ace62b6a4e258f7763dbf6fbeda07feb4cd5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -124,7 +124,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 7ec1f0ae753724dac5c4675926ead87a097a7a99..56dfb095def62bc617948821038f0c15c1547683 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -16,12 +16,17 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +# FIXME(zjl): It seems that this unittest fails randomly +# when comparing all reduce last loss and reduce last loss +# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta +# Disable it temporarily. +''' from test_parallel_executor_mnist import TestMNIST class EagerDeletionTestMNIST(TestMNIST): pass - +''' if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index dfe4daca95af5e7b1aff93c6fa9027dec7c64642..adf35c851bf05011223e483e472900a3d415e2ee 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + def test_sum_op(self): + x = np.ones([2, 2], np.float32) + with fluid.imperative.guard(): + inputs = [] + for _ in range(10): + inputs.append(fluid.imperative.base.to_variable(x)) + ret = fluid.layers.sums(inputs) + loss = fluid.layers.reduce_sum(ret) + loss._backward() + self.assertTrue(np.allclose(ret._numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0]._gradient(), x)) + def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() @@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase): x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) param_grads = fluid.backward.append_backward( x, parameter_list=[x1.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase): x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase): out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) static_out, static_grad = exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 4fe286f85ec551946a9431f70d7012b4e7d79662..681661bfc63db95653be371688a047efe96f3866 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope @@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_gan_float32(self): seed = 90 startup = fluid.Program() @@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase): sgd = SGDOptimizer(learning_rate=1e-3) sgd.minimize(g_loss) - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) static_params = dict() with fluid.scope_guard(scope): img = np.ones([2, 1], np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 63eeae4b712c2064309b664b91d5f0347b67817d..d0a5a883174cb33a035b344f9489b2ba02ba99f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..87a72dd04e376cf9225e275d862b0cbbb9774e2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -0,0 +1,370 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 1281164, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # optimizer = fluid.optimizer.Momentum( + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + + return optimizer + + +class ConvBNLayer(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.imperative.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper('elementwise_add_activation', act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.imperative.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = self.out(y) + return y + + +class TestImperativeResnet(unittest.TestCase): + def test_resnet_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + + dy_grad_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + + optimizer.minimize(avg_loss) + + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + + self.assertTrue(np.allclose(static_out, dy_out)) + + self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + self.assertEqual(len(dy_grad_value), len(static_grad_value)) + for key, value in six.iteritems(static_grad_value): + self.assertTrue(np.allclose(value, dy_grad_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + self.assertEqual(len(dy_param_value), len(static_param_value)) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 90f5d797a67d951e618e64cfc5a3608335714e05..c13f03e86f3e375026b04a31d51ac1a5223360ef 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_shuffle_channel(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.shuffle_channel(x, group=4) + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..aeaae9058187be1c9191bcbec21237c69fefe6e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest +import paddle.fluid.core as core + + +class TestShuffleChannelOp(OpTest): + def setUp(self): + self.op_type = "shuffle_channel" + self.batch_size = 10 + self.input_channels = 16 + self.layer_h = 4 + self.layer_w = 4 + self.group = 4 + self.x = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype('float32') + self.inputs = {'X': self.x} + self.attrs = {'group': self.group} + n, c, h, w = self.x.shape + input_reshaped = np.reshape(self.x, + (-1, self.group, c // self.group, h, w)) + input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) + self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py index 6b78ceeaeec4d9b3db6524a5b5e939f88267340c..89dd4dd50b0299de986b84f46e889d554030f180 100644 --- a/python/paddle/fluid/transpiler/details/checkport.py +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -16,6 +16,7 @@ import sys import time import socket from contextlib import closing +from six import string_types def wait_server_ready(endpoints): @@ -32,6 +33,7 @@ def wait_server_ready(endpoints): wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ + assert not isinstance(endpoints, string_types) while True: all_ok = True not_ready_endpoints = [] @@ -45,7 +47,7 @@ def wait_server_ready(endpoints): all_ok = False not_ready_endpoints.append(ep) if not all_ok: - sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.write("server not ready, wait 3 sec to retry...\n") sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) + "\n") sys.stderr.flush() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index c61cb54e1f20d647e20538c880bb111a9268a4eb..e58f34e3750803669149685003ea5858fa775ed7 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -477,13 +477,16 @@ class DistributeTranspiler(object): trainer_id, trainers, current_endpoint, - startup_program=None): + startup_program=None, + wait_port=True): if not startup_program: startup_program = default_startup_program() if trainer_id >= 0: worker_endpoints = trainers.split(",") # send NCCL_ID to others or recv from trainer 0 worker_endpoints.remove(current_endpoint) + if trainer_id == 0 and wait_port: + wait_server_ready(worker_endpoints) nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) @@ -564,11 +567,13 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) + self.origin_program._trainers_endpoints = trainers.split(",") self._transpile_nccl2( trainer_id, trainers, current_endpoint, - startup_program=startup_program) + startup_program=startup_program, + wait_port=self.config.wait_port) return self.trainer_num = trainers diff --git a/python/setup.py.in b/python/setup.py.in index fb4b273a0676fcbcb4402eaf54ddf73d37a2754f..c947785cbf7517be56c3e43120db65284ab22d10 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,6 +109,7 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.graph',