diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec632e20690eafdc558e24f160270a89b29ee41..e85fce58368aa233e39a554947e20a128fce6218 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,9 +276,3 @@ add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() - -if(WITH_DOC) - find_package(Sphinx REQUIRED) - find_python_module(recommonmark REQUIRED) - add_subdirectory(doc) -endif() diff --git a/Dockerfile b/Dockerfile index acfd091265e26d6c29c561d166fed2504c0cff1c..fe0721e9b99b5e028df2f6228ff04cb56a567a3f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG WITH_GPU ARG WITH_AVX -ARG WITH_DOC ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} ENV HOME /root # Add bash enhancements diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake deleted file mode 100644 index f74cd4ff8c9c2c52319b18ac37264167b3718eae..0000000000000000000000000000000000000000 --- a/cmake/FindSphinx.cmake +++ /dev/null @@ -1,147 +0,0 @@ -# - This module looks for Sphinx -# Find the Sphinx documentation generator -# -# This modules defines -# SPHINX_EXECUTABLE -# SPHINX_FOUND - -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - PATHS - /usr/bin - /usr/local/bin - /opt/local/bin - DOC "Sphinx documentation generator" -) - -if( NOT SPHINX_EXECUTABLE ) - set(_Python_VERSIONS - 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 - ) - - foreach( _version ${_Python_VERSIONS} ) - set( _sphinx_NAMES sphinx-build-${_version} ) - - find_program( SPHINX_EXECUTABLE - NAMES ${_sphinx_NAMES} - PATHS - /usr/bin - /usr/local/bin - /opt/loca/bin - DOC "Sphinx documentation generator" - ) - endforeach() -endif() - -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(Sphinx DEFAULT_MSG - SPHINX_EXECUTABLE -) - - -option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) -option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF ) -option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) -option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) -option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) -option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) -option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) -option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) -option( SPHINX_TEXT_OUTPUT "Build plain text files." 
OFF ) - - -mark_as_advanced( - SPHINX_EXECUTABLE - SPHINX_HTML_OUTPUT - SPHINX_DIRHTML_OUTPUT - SPHINX_HTMLHELP_OUTPUT - SPHINX_QTHELP_OUTPUT - SPHINX_DEVHELP_OUTPUT - SPHINX_EPUB_OUTPUT - SPHINX_LATEX_OUTPUT - SPHINX_MAN_OUTPUT - SPHINX_TEXT_OUTPUT -) - -function( Sphinx_add_target target_name builder conf cache source destination ) - add_custom_target( ${target_name} ALL - COMMAND ${SPHINX_EXECUTABLE} -b ${builder} - -d ${cache} - -c ${conf} - ${source} - ${destination} - COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -sf ./index_*.html index.html - ) - - set_property( - DIRECTORY APPEND PROPERTY - ADDITIONAL_MAKE_CLEAN_FILES - ${destination} - ) -endfunction() - -# Target dependencies can be optionally listed at the end. -function( Sphinx_add_targets target_base_name conf source base_destination ) - - set( _dependencies ) - - foreach( arg IN LISTS ARGN ) - set( _dependencies ${_dependencies} ${arg} ) - endforeach() - - if( ${SPHINX_HTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html ) - - add_dependencies( ${target_base_name}_html ${_dependencies} ) - endif() - - if( ${SPHINX_DIRHTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml ) - - add_dependencies( ${target_base_name}_dirhtml ${_dependencies} ) - endif() - - if( ${SPHINX_QTHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp ) - - add_dependencies( ${target_base_name}_qthelp ${_dependencies} ) - endif() - - if( ${SPHINX_DEVHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp ) - - add_dependencies( ${target_base_name}_devhelp ${_dependencies} ) - endif() - - if( ${SPHINX_EPUB_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub ) - - add_dependencies( ${target_base_name}_epub ${_dependencies} ) - endif() - - if( ${SPHINX_LATEX_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex ) - - add_dependencies( ${target_base_name}_latex ${_dependencies} ) - endif() - - if( ${SPHINX_MAN_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man ) - - add_dependencies( ${target_base_name}_man ${_dependencies} ) - endif() - - if( ${SPHINX_TEXT_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text ) - - add_dependencies( ${target_base_name}_text ${_dependencies} ) - endif() - - if( ${BUILD_TESTING} ) - sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck ) - - add_dependencies( ${target_base_name}_linkcheck ${_dependencies} ) - endif() -endfunction() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1f4dbe0b49825aef9a236f7ae72c6bea168b2ec5..6679a09dfc9dd00cfe3b5c5da3e12bd1c1389432 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. 
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -708,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 430882dee968fd8e944627cab1cb1d156b29f5ae..afd3342768701adba4ff0040bd1c762b1cd8739d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) @@ -121,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 
'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -212,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) @@ -358,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt 
b/paddle/fluid/framework/CMakeLists.txt index a167511160d074c13ca1dca36b4f2c5eeea4bb93..66f11dedbaccd7febcd75fa7ade9c68b6c42022c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,3 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(WITH_NGRAPH) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) -endif(WITH_NGRAPH) - cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - else() - if(WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(WITH_NGRAPH) + if (WITH_NGRAPH) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) + else () cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(WITH_NGRAPH) + endif() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -214,3 +206,24 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) + +# Get the current working branch +execute_process( + COMMAND git rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +message(STATUS "commit: ${PADDLE_COMMIT}") +message(STATUS "branch: ${PADDLE_BRANCH}") + +configure_file(commit.h.in commit.h) diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in new file mode 100644 index 0000000000000000000000000000000000000000..3a33ece624443a99083ae29abb70254a5ac40a3d --- /dev/null +++ b/paddle/fluid/framework/commit.h.in @@ -0,0 +1,21 @@ +#pragma once + +#include <string> + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "@PADDLE_COMMIT@"; +} + +static std::string paddle_compile_branch() { + return "@PADDLE_BRANCH@"; +} + +static std::string paddle_version() { + return "@PADDLE_VERSION@"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.h
b/paddle/fluid/framework/details/build_strategy.h index 603df2e06936e3d9d8e7ec62efd0c6e83200239c..cd24a3175953bf323748bf0c7e3159761c13f0a9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector<std::string> trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312b3050debe745f2d3c108469c5d6..318694a1d4b0599655f05bf01c907fb6c07a4193 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,6 +25,9 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; + // If this is set to 1, all variables will be deleted at the end of every + // batch, which costs 15%+ in performance. + // Please be aware of this parameter. size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7ceecce9193acfae0b4e03c06212edd6d6..4323883fa5cc9b26a68c2980f3b7a49eca610543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr<OperatorBase>(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 2ee12cc410393d1e1aa5fc9e5374d858eca1b901..929d9edc34ffb92f468d5b7af54a0b8da4121543 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include <set> #include <unordered_set> namespace paddle {
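[Review note] The graph_traits.cc hunk below swaps the to_visit worklist from std::unordered_set to std::set, presumably so NodesTSIterator visits pending nodes in a stable, ordered fashion instead of hash order. A minimal standalone C++ sketch of the behavioral difference (illustrative only, not part of the patch):

#include <iostream>
#include <set>
#include <unordered_set>

int main() {
  std::unordered_set<int> u{3, 1, 2};  // iteration order depends on hashing
  std::set<int> s{3, 1, 2};            // always iterates in sorted order
  for (int v : u) std::cout << v << ' ';  // order unspecified, may vary
  std::cout << '\n';
  for (int v : s) std::cout << v << ' ';  // prints: 1 2 3
  std::cout << '\n';
  return 0;
}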
@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) { } std::unordered_set<Node *> visited; - std::unordered_set<Node *> to_visit{source.begin(), source.end()}; + std::set<Node *> to_visit{source.begin(), source.end()}; std::vector<Node *> inlink_visited; while (!to_visit.empty()) { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8fbbc6584e121d22bdec8173d501a35dc97c9c06..f46bdf96ba1e9e1e137c690057051d9a127d45c9 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { if (!platform::is_cpu_place(t.place())) { - LoDTensor tt; - framework::TensorCopy(t, platform::CPUPlace(), &tt); + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(t.place()); dev_ctx.Wait(); - os << tt; + os << cpu_tensor; return os; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf04dceecc164fae934ee15c4563af1..5d854cb8d7856a631faf01741d29d3cecfd9a627 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc deleted file mode 100644 index 7e174c7def1ffa4089a94d9cc504b18843557c53..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.cc +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. 
CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - 
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& 
var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == 
proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44bea208b66acc3b3f4bc0f4adee4fb860..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/variant.h" - -#include "ngraph/type/element_type.hpp" - -namespace paddle { -namespace framework { - -class NgraphOperator : public OperatorBase { - public: - static std::vector< - std::vector>::iterator>> - NgraphOpIntervals( - std::vector>* ops); - - explicit NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); - - void RunImpl(const Scope& scope, const platform::Place& place) const final; - - private: - const ProgramDesc pdesc_; - size_t block_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - bool is_full_ = false; - - void Process(); -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38f811c0e9e8ecc6a4226e17e621a3c2ef3f78c5..ab3cf308fc04e227d5402712f6bab226fea04711 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -19,8 +19,6 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - int data_type = -1; + proto::VarType::Type default_data_type = + static_cast<proto::VarType::Type>(-1); + proto::VarType::Type data_type = default_data_type; for (auto& input : this->inputs_) { const std::vector vars = ctx.MultiInputVar(input.first); for (size_t i = 0; i < vars.size(); ++i) { @@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); - int tmp = static_cast(t->type()); + proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( - tmp == data_type || data_type == -1, + tmp == data_type || data_type == default_data_type, - "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", - Type(), data_type, tmp); + "DataType of Paddle Op %s must be the same. Get (%s) != (%s)", + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast<proto::VarType::Type>(data_type); + PADDLE_ENFORCE(data_type != default_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d..ef5404e4755817cefc925acbf4882ff86d1f0ba3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", + DataTypeToString(type_)); return reinterpret_cast<const T*>( reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a730b84a916ea2c3e17dd4becaf939cc28160457..5db422119966948f75970874e13d416ea699158a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b7df4b8886d629e98225c95eae9a4f2ed9400710..83fc6ee2e299f5fa18d5cc6f220c0be6a66e709d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,6 +13,7 @@ // limitations under the License.
#include "paddle/fluid/imperative/layer.h" + #include #include #include @@ -22,6 +23,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -34,22 +38,66 @@ std::map py_funcs_; using framework::Variable; -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label // ugly fix for it if (src_tensor->numel() == 0) { return; } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (int64_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { @@ -120,66 +168,104 @@ class Autograd { } }; +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + std::unique_ptr new_var(new VarBase()); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->Resize(var_->Get().dims()); + tensor->set_lod(var_->Get().lod()); + + if (blocking) { + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + } + + return new_var; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs.resize(1); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. 
+ // grad_op_desc_->InferShape(*block_); + grad_op_desc->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } } - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad); - delete grad; + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(grad, orig_grad, place_); + delete grad; + } } } + return input_vars_; } @@ -188,8 +274,10 @@ void VarBase::RunBackward() { VLOG(3) << "start backward"; auto grads_t = grads_->var_->GetMutable(); - float* data = grads_t->mutable_data(platform::CPUPlace()); - std::fill(data, data + grads_t->numel(), 1.0); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + var_->GetMutable()->place())), + grads_t, 1.0); PADDLE_ENFORCE( grads_ == diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 0b1077c640e076797ba7e0200dc8d0eb8bfcff16..dc97433a5102b39d03ea5cac3157c027f9d67c98 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -21,17 +21,21 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { +class VarBase; + namespace py = ::pybind11; class PreparedOp { @@ -81,6 +85,8 @@ class PreparedOp { return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); } + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } + const framework::OperatorBase& op; const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; @@ -148,6 +154,9 @@ class VarBase { framework::LoDTensor& GradValue(); + std::unique_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; + inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, @@ -175,11 +184,13 @@ class OpBase { OpBase() : op_desc_(nullptr), forward_id_(-1), - 
grad_op_desc_(nullptr), - backward_id_(-1) {} + backward_id_(-1), + place_(platform::CPUPlace()) {} virtual ~OpBase() { - if (grad_op_desc_) delete grad_op_desc_; + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } } std::map> ApplyGrad(); @@ -188,18 +199,25 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; - // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. - framework::OpDesc* grad_op_desc_; + // Note: each fwd op corresponds to a vector of bwd ops. + std::vector grad_op_descs_; int backward_id_; + platform::Place place_; + VarBasePtrMap input_vars_; VarBasePtrMap output_vars_; OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - framework::VariableValueMap grad_input_vars_; - framework::VariableValueMap grad_output_vars_; + // Inputs to a vector of bwd ops. + std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. + std::vector grad_output_vars_; + framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 843fee41f38f1247473ba06978248659495f8585..cd62807a5532e6b2309cb5a8f679c3097b51c9e9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,33 +14,60 @@ #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace imperative { void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, + std::vector* grad_op_descs, std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = + PADDLE_ENFORCE(grad_op_descs->empty()); + std::vector> descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? 
- *grad_op_desc = grad_op_descs[0].release(); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } -void InitVar(framework::Variable* var, framework::Variable* grad_var) { +void InitVar(framework::Variable* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); auto& var_t = var->Get(); - float* data = - grad_var->GetMutable()->mutable_data( - var_t.dims(), platform::CPUPlace()); - std::fill(data, data + var_t.numel(), 0.0); + grad_var->GetMutable()->mutable_data( + var_t.dims(), dev_ctx->GetPlace()); + operators::math::set_constant( + *dev_ctx, grad_var->GetMutable(), 0.0); +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (auto it : inputs) { + for (VarBase* var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "Input variable should keep in the same place: %s, but get place: " + "%s of input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; } void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, const bool stop_gradient) { std::map vars; @@ -105,51 +132,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func(framework::ExecutionContext( + prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs. 
-          grad_in_vars.push_back(fwd_var_it->second->var_);
-        } else {
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
+    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
+
+    op->grad_input_vars_.resize(op->grad_op_descs_.size());
+    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+      framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            // Forward inputs or outputs.
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->var_->IsInitialized()) {
+              InitVar(var->var_, var->grads_->var_,
+                      prepared_op.GetDeviceContext());
+            }
+            // Douts.
+            grad_in_vars.push_back(var->grads_->var_);
           }
-          // Douts.
-          grad_in_vars.push_back(var->grads_->var_);
         }
       }
-    }
 
-    for (auto it : grad_op_desc->Outputs()) {
-      auto& grad_out_vars = op->grad_output_vars_[it.first];
-      for (const std::string& grad_outvar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_outvar);
-        auto var_it = grad_to_var->find(grad_outvar);
-        PADDLE_ENFORCE(var_it != grad_to_var->end());
-        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[i][it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end(),
+                         "Could not find the grad op output var; should this "
+                         "operator %s's stop_gradient be True?",
+                         op_desc->Type());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->var_->IsInitialized()) {
+            InitVar(var->var_, var->grads_->var_,
+                    prepared_op.GetDeviceContext());
+          }
+          grad_out_vars.push_back(var->grads_->var_);
         }
-        grad_out_vars.push_back(var->grads_->var_);
       }
     }
   }
@@ -178,10 +213,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }
   if (!stop_gradient) {
+    op->grad_input_vars_.resize(1);
+    op->grad_output_vars_.resize(1);
     auto& grad_input_vars =
-        op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
+        op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
     auto& grad_output_vars =
-        op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
+        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
 
     for (const VarBase* inp : inputs) {
       grad_input_vars.push_back(inp->var_);
@@ -189,16 +226,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->var_);
     }
+
+    platform::CPUPlace place;
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->grads_->var_);
       if (!grad_input_vars.back()->IsInitialized()) {
-        InitVar(out->var_, grad_input_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer; only CPU is supported now.
+        InitVar(out->var_, grad_input_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
       }
     }
+
    for (const VarBase* inp : inputs) {
      grad_output_vars.push_back(inp->grads_->var_);
      if (!grad_output_vars.back()->IsInitialized()) {
-        InitVar(inp->var_, grad_output_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer; only CPU is supported now.
+        InitVar(inp->var_, grad_output_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
      }
    }
  }
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index f225d8abe6c0635d2bdd8dba0b12c7fc3a4110db..690838215581b09ff35a0ea13f30655b77e6e187 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace imperative {
@@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc,
 
 void InitVar(framework::Variable* var, framework::Variable* grad_var);
 
+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
+
 class Tracer {
  public:
  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
 
  virtual ~Tracer() {}
 
-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block, const bool stop_gradient = false);
+  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+             const VarBasePtrMap& outputs, framework::BlockDesc* block,
+             const platform::Place expected_place,
+             const bool stop_gradient = false);
 
  std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                bool stop_gradient = false);
 
 private:
+  platform::Place GetPlace(const VarBasePtrMap& inputs);
+
  framework::BlockDesc* root_block_;
};
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 88ce61f9b928aba1945bddc1f9f6b785834780ca..a2546ead93c3baeb8029f6451d8a60dcc75f8571 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
@@ -130,10 +131,14 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
+                      contrib::AnalysisConfig::Precision);
 
  // Memory optimization related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
 
  // Indicate which kind of sort algorithm is used for operators; the memory
  // optimization relies on the sort algorithm.
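
Tying the tracer API together, a hypothetical call site now chooses the device explicitly instead of implicitly tracing on the CPU (all names below are illustrative):

    // Sketch: trace one forward op; the tracer stores the resolved place in
    // op->place_ and uses it to prepare the kernel and gradient variables.
    imperative::Tracer tracer(root_block);
    tracer.Trace(op, inputs, outputs, block,
                 platform::CUDAPlace(0) /* expected_place */,
                 false /* stop_gradient */);
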
  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc
index ca40c01fc57dbcc2ca16770a1b7d798de8b5625b..4f5c50d0d6b9ac94130cb82fb342ae5ee592f2c0 100644
--- a/paddle/fluid/inference/analysis/helper.cc
+++ b/paddle/fluid/inference/analysis/helper.cc
@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
   attr->set_i(data);
 }
 template <>
+void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
+                   const bool &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+  attr->set_b(data);
+}
+template <>
 void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
                       const int64_t &data) {
   auto *attr = op->add_attrs();
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index de04713b531dc421b885473cc8956e8ba6b63574..120f6ef27d49ae59ec36304dc3742cd9ca0afa4b 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include <fstream>
 #include
 #include
 #include
@@ -29,9 +30,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 
 #ifdef _WIN32
+#include <direct.h>
+#include <io.h>
 #define GCC_ATTRIBUTE(attr__) ;
+#define MKDIR(path) _mkdir(path)
 #else
+#include <sys/stat.h>
 #define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
 #endif
 #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
@@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) {
   return false;
 }
 
+static std::string GetDirRoot(const std::string &path) {
+  char sep = '/';
+
+#ifdef _WIN32
+  sep = '\\';
+#endif
+
+  size_t i = path.rfind(sep, path.length());
+  if (i != std::string::npos) {
+    return (path.substr(0, i));
+  }
+  return path;
+}
+
+static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
+  std::string opt_cache_dir = model_root + "/_opt_cache/";
+  if (!PathExists(opt_cache_dir)) {
+    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
+                   "Cannot create the optimize cache directory: %s. Make sure "
+                   "you have write permission.",
+                   opt_cache_dir);
+  }
+  return opt_cache_dir;
+}
+
+static std::string GetTrtCalibPath(const std::string &model_root,
+                                   const std::string &engine_key) {
+  return model_root + "/trt_calib_" + engine_key;
+}
+
+// If there is no calib table data file in model_opt_cache_dir, return "".
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4e1464226450b833e6d8dae2be2dcad89dd1e5e4..99611ce84b23896dd173831a03d77c6e0252d998 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8; + + pass->Set("enable_int8", new bool(enable_int8)); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); @@ -91,11 +105,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const { + std::unique_ptr *graph, ProgramDesc *program) const { auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. 
+  ProgramDesc desc;
+  desc.CopyFrom(*program->Proto());
   pass->SetNotOwned("program", &desc);
   auto *the_graph = graph->release();
   *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
index 983a582649706fa6eedb5aa459b5ac53b98f658b..2a595cb36b8345157b3fd26afc62aabfa98b87bc 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -29,6 +29,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
 namespace inference {
@@ -42,8 +43,8 @@ class IRPassManager final {
 
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
 
-  framework::proto::ProgramDesc AcquireProgram(
-      std::unique_ptr<Graph> *graph, const ProgramDesc &program) const;
+  framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
+                                               ProgramDesc *program) const;
 
   framework::ir::Graph &graph() const { return *graph_; }
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 5f25303cc1eaa6b563f0f8f4289b38499eb487cc..69a9caec030600332c9f11ba255e4e642bd41e96 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include
+#include <set>
 #include
 #include
 
@@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
   return graph;
 }
 
+std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
+                              const std::set<std::string> &engine_outputs) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
+
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
 
+  framework::ProgramDesc *program_desc =
+      Get<framework::ProgramDesc *>("program");
+  // Add a new block for the TensorRTEngineOp.
+  const framework::BlockDesc &main_block =
+      program_desc->Block(framework::kRootBlockIndex);
+  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
+
   // A fake block desc.
   framework::proto::BlockDesc block_proto;
   framework::BlockDesc block_desc(nullptr, &block_proto);
@@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
           subgraph.size());
 
   for (auto *node : subgraph) {
+    auto *new_block_op = new_block->AppendOp();
     auto *op = block_desc.AppendOp();
+    *new_block_op->Proto() = *node->Op()->Proto();
     *op->Proto() = *node->Op()->Proto();
   }
 
-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
+  // Then, we will use input_names_with_id and output_names_with_id to
+  // generate the engine key.
+  // So we use std::set instead of std::unordered_set here to ensure that the
+  // engine key is deterministic and unique.
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
@@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
@@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // to Tensor.
   std::vector<std::string> output_mapping;
   for (auto name : output_names) {
-    // LOG(INFO) << name << " " << output_name_map.size();
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }
@@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
       *vars->Add() = *node->Var()->Proto();
     }
   }
+
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
   PADDLE_ENFORCE(!output_mapping.empty());
-  // Set attrs
+  op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
+  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+
+  auto enable_int8 = Get<bool>("enable_int8");
+  auto engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id);
+
+  std::string calibration_data = GetTrtCalibTableData(
+      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
+
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "engine_key", engine_key);
 }
 
 std::vector<std::string> ExtractParameters(
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index 691c336ebe4b6a6cb60023859b21665b6a4756a8..9d74dc6c211e4fcb6d1e7de5369eee847f49fc78 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
 cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
index f1da37af3cc5fa55eb66a1822aefe96eda1dc4fb..6b3d80fcef0be1527062edbb37ea39cc5d95a168 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
   }
 
   std::unique_ptr
graph(argument->main_graph_ptr()); - framework::ProgramDesc desc(argument->main_program()); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); auto thegraph = pass->Apply(std::move(graph)); thegraph.release(); // the argument still own the graph. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b727ef1c922e3a308db28d0af4f193602..3d1be9196fdeacd8ff852dbb595473a687352ccf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. 
+  const auto range = GetRange(var_batch_ave_size);
+  const int cluster_size = std::max(
+      static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
+      1024);
+  const int cluster_size1 = std::max(
+      static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
+      1024);
 
-    std::unordered_map tensor_nodes;
-    space_table_t space_table;
-    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+  std::unordered_map tensor_nodes;
+  space_table_t space_table;
+  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
 
-    std::unordered_map<std::string, std::string> reuse_table;
-    double max_saving_ratio = 0.;
+  std::unordered_map<std::string, std::string> reuse_table;
+  double max_saving_ratio = 0.;
 
-    std::vector<std::function<MemoryAllocation()>> strategies;
+  std::vector<std::function<MemoryAllocation()>> strategies;
 
-    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+    if (argument->static_memory_optim()) {
+      // This strategy only makes sense in static memory optimization.
       strategies.emplace_back([&, sort_kind] {
         auto clustered_vars_by_batch_size =
             AnalysisBatchShapesByBatchSize(batches);
@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
                       space_table, &reuse_table, sort_kind, &allocation);
         return allocation;
       });
+    }
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024);  // interval 1kb
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches,
+          std::numeric_limits<int>::max());  // no intervals
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+  }
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024 * 1024);  // interval 1MB
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
+  std::function<MemoryAllocation()>* best_strategy{nullptr};
 
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches,
-            std::numeric_limits<int>::max());  // no intervals
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
+  // Try all strategies to get the best result.
+ for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. / 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6aeff60ec4468f41140c57be790af7f..216f416de0d1003b944337ee98fb4e6a22c66fc5 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed8306ef08144d096afa4f86133e492d..8efd514bd8397f099fd07321ad7e5d4ca253e229 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT releated. 
CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size) { +void contrib::AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + contrib::AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; Update(); #else @@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void contrib::AnalysisConfig::EnableMemoryOptim( + bool static_optim, bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } @@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa0e31cb7546d405e05e36754915f653..66374cb7f07b3d9b6bfbff8382a3dfa7e8f2b04f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,6 +39,8 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); @@ -44,6 +48,12 @@ DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; +using inference::Singleton; +#if PADDLE_WITH_TENSORRT 
+using inference::tensorrt::TRTInt8Calibrator;
+using inference::tensorrt::TRTCalibratorEngine;
+using inference::tensorrt::TRTCalibratorEngineManager;
+#endif
 
 namespace {
 bool IsPersistable(const framework::VarDesc *var) {
@@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
   if (!program) {
     if (!LoadProgramDesc()) return false;
 
+    // If not cloned, the parameters should be loaded.
+    // If config_.ir_optim() is True, the parameters are loaded in
+    // OptimizeInferenceProgram(), but other persistable variables
+    // (like RAW type vars) are not created in the scope.
+    // If config_.ir_optim() is False, the parameters are loaded in
+    // LoadParameters(), and the other persistable variables still need to be
+    // created. So in both cases, create the persistable variables first.
+    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
+
     // Optimize the program, and load parameters and modify them in the
     // scope_.
     // This will change the scope_ address.
@@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram(
       status_ir_optim_enabled_ = true;
       OptimizeInferenceProgram();
     } else {
-      // If the parent_scope is passed, we assert that the persistable variables
-      // are already created, so just create the no persistable variables.
-
-      // If not cloned, the parameters should be loaded
-      // OptimizeInferenceProgram.
-      // So in both cases, just the local variables are needed to load, not the
-      // parematers.
-      executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
-
       // Load parameters
       LOG(INFO) << "load parameters ";
       LoadParameters();
@@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                  framework::Scope *scope) {
   VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+  outputs->resize(fetches_.size());
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
     framework::LoDTensor &fetch =
         framework::GetFetchVariable(*scope, "fetch", idx);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
+    output->name = fetches_[idx]->Input("X")[0];
     if (type == framework::proto::VarType::FP32) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
@@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
+  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
+  argument_.SetStaticMemoryOptimForceUpdate(
+      config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir().empty()) {
@@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
                    !config_.params_file().empty(),
                    "Either model_dir or (param_file, prog_file) should be set.");
     PADDLE_ENFORCE(!config_.prog_file().empty());
+    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
+
     argument_.SetModelProgramPath(config_.prog_file());
     argument_.SetModelParamsPath(config_.params_file());
   }
@@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
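
On the user side, INT8 becomes a per-engine opt-in through the new precision argument. A sketch of the intended configuration flow (the model path is hypothetical); per the calibration logic above, the predictor must first run on representative data so the calibration table can be produced:

    contrib::AnalysisConfig config("/path/to/model_dir");
    config.EnableUseGpu(100 /* memory pool MB */, 0 /* GPU id */);
    config.EnableTensorRtEngine(1 << 20 /* workspace_size */,
                                1 /* max_batch_size */,
                                3 /* min_subgraph_size */,
                                contrib::AnalysisConfig::Precision::kInt8);
    auto predictor = CreatePaddlePredictor(config);
    // Run the predictor on calibration data; the table is written under
    // <model_dir>/_opt_cache/ when the predictor is destroyed.
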
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -361,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); - argument_.SetScopeNotOwned(const_cast(scope_.get())); + argument_.SetScopeNotOwned(scope_.get()); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. 
+ Singleton::Global().DeleteALL(); + return true; +} +#endif + AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); @@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } @@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } +std::string AnalysisPredictor::GetSeriazlizedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9095b6ec1af6794c19e94fc9326a48239b3ba145..fa1d0d596df5a3619af74e0fead3a0b376186e08 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); + std::string GetSeriazlizedProgram() const override; + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk + bool SaveTrtCalibToDisk(); +#endif + // Some more detailed tests, they are made the friends of the predictor, so that // the all the details can be tested. #if PADDLE_WITH_TESTING @@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. 
std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 4688e93d7102109d2c7ece9ba37bc8f2d311dcf1..20b61344da978a87baf654efd4ad2b3ae90454c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); + ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6..6cd18277d63200f5bccf180a7ae3196b0ce126ff 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -97,4 +99,12 @@ void PaddleBuf::Free() { } } +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 7a579610eefda24c911edd28b5f3a178aa10ab1e..2c450ef7cead4d5c3870d5e9186eb221e5dc19a0 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { predictor->Run({}, &outputs); } +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee8904500636d7b49e6b4e54595dbce6a79954..5b899b26d60dec3634d7016c925143e1ae26992d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -135,7 +139,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -162,17 +167,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. 
*/ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +190,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -238,10 +234,12 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; // memory reuse related. bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 46b510fd1ec94c59032b8f41a2ac4d6aa87dc150..406983224615fbdb649301f1ffe3fbd136938a61 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -215,6 +215,14 @@ class PaddlePredictor { */ virtual ~PaddlePredictor() = default; + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSeriazlizedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + }; + /** The common configs for all the predictors. 
*/ struct Config { @@ -288,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config); int PaddleDtypeSize(PaddleDType dtype); +std::string get_version(); + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d209922ebe8d31723ca25c71a952ea08bd6..391932a1ee018c45818457c55fd8f82a22ab7405 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176c70bc03166ec7732ae5e2faf67ea54..f4977d08c4d051b8a528e122c47948c3c81d153c 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 01d7f700da9cc67d0ebbd3d9649e3823f58a8811..c5a413221ebff6b9be114151dbb93fd23a148440 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { // init trt engine cudaStream_t stream_; std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); - engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f313beb73bb0d21cab1d62859a46fcc76a373548..e83961f3d7bda03a7659f175c59105dcb60708e9 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -78,11 +78,9 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - // create engine. 
- engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); - engine_->InitNetwork(); - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_->InitNetwork(); } // Declare a Variable as input with random initialization. @@ -175,7 +173,7 @@ class TRTConvertValidation { op_->Run(scope_, place); // Execute TRT. engine_->Execute(batch_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; @@ -184,7 +182,7 @@ class TRTConvertValidation { std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc44805cb0fb3246385609cf16ba744a..10f48462cfaf8073a4f5537d654d614d36b74db4 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } - PADDLE_ENFORCE_NOT_NULL(stream_); - infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); - cudaStreamSynchronize(*stream_); + infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); + cudaStreamSynchronize(stream_); SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(*stream_); + cudaStreamSynchronize(stream_); // clean buffer for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { @@ -70,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. 
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); @@ -173,7 +179,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, *stream_), + cudaMemcpyDeviceToDevice, stream_), 0); } @@ -194,7 +200,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -211,12 +217,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, *stream_)); + cudaMemcpyHostToDevice, stream_)); } void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, @@ -227,7 +232,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, *stream_)); + cudaMemcpyDeviceToDevice, stream_)); } void TensorRTEngine::SetITensor(const std::string &name, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9e6fefc1d6c14640d696c3bf3ac8249..cdfe09b5a7fd2d1f8548dab9421f671f5a345153 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -54,17 +56,17 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, int device = 0, + TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, + int device = 0, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream ? 
stream : &default_stream_), - logger_(logger), - device_(device) { - freshDeviceId(); - cudaStreamCreate(stream_); - } + stream_(stream), + device_(device), + enable_int8_(enable_int8), + calibrator_(calibrator), + logger_(logger) {} virtual ~TensorRTEngine(); @@ -102,7 +104,7 @@ class TensorRTEngine : public EngineBase { // NOTE this should be used after calling `FreezeNetwork`. Buffer& buffer(const std::string& name) override; - cudaStream_t* stream() { return stream_; } + cudaStream_t stream() { return stream_; } // Fill an input from CPU memory with name and size. void SetInputFromCPU(const std::string& name, const void* data, size_t size); @@ -142,8 +144,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. std::unordered_map itensor_quote_num; @@ -156,11 +158,15 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; + cudaStream_t stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t* stream_; - // If stream_ is not set from outside, hold its own stream. - cudaStream_t default_stream_; + nvinfer1::ILogger& logger_; std::vector buffers_; @@ -169,8 +175,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members @@ -208,38 +212,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. 
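
With the stream now passed by value and the calibrator injected through the constructor, standalone engine construction looks roughly as follows (a sketch; the calibrator is owned elsewhere and, per FreezeNetwork(), must be non-null when enable_int8 is true):

    cudaStream_t stream;
    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0);
    TRTInt8Calibrator* calibrator = nullptr;  // required (non-null) for INT8
    TensorRTEngine engine(1 /* max_batch */, 1 << 20 /* max_workspace */,
                          stream, 0 /* device */, false /* enable_int8 */,
                          calibrator);
    engine.InitNetwork();
    // ... declare inputs, convert ops, then engine.FreezeNetwork() ...
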
- TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index da1f6535cb3b2476cd475797861d6d2bb6d88856..9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - // ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, &stream_); + ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a85c8b8fe6d70052edd3be59f98582c9b2e86b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
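+//
+// Rough usage sketch (illustrative only; the names `buf_bytes`, `dev_ptr`,
+// `batches` and the surrounding loop are hypothetical, not part of this
+// patch): a producer thread feeds device buffers through setBatch() while
+// TensorRT's builder drains them through getBatch().
+//
+//   TRTInt8Calibrator calib({{"input0", buf_bytes}}, batch_size,
+//                           "engine_0", place);
+//   std::thread producer([&] {
+//     for (void* dev_ptr : batches) {
+//       calib.setBatch({{"input0", dev_ptr}});  // blocks until consumed
+//     }
+//     calib.waitAndSetDone();  // no more data; unblocks the builder
+//   });
+//   infer_builder->setInt8Calibrator(&calib);  // builder pulls the batches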
+ +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. 
+ while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch. + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock<std::mutex> lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string(static_cast<const char*>(ptr), length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr + << " length=" << length; +} + +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 0000000000000000000000000000000000000000..919f5d55f88c3a6473f66371e2f3d91f3c4721c5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
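+//
+// TRTInt8Calibrator adapts Paddle tensors to TensorRT's INT8 calibration
+// interface: the op thread produces batches via setBatch() and the TensorRT
+// builder thread consumes them via getBatch(). mut_/cond_ implement the
+// single-slot handoff between the two, with calib_running_, data_is_set_ and
+// done_ tracking the state of that slot.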
+ +#pragma once +#include <condition_variable> +#include <memory> +#include <mutex> +#include <string> +#include <thread> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <NvInfer.h> +#include <cuda_runtime_api.h> +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map<std::string, void*>& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_; + std::vector<framework::Tensor> data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorEngine { + public: + TRTCalibratorEngine() {} + std::unique_ptr<TRTInt8Calibrator> calib_; + std::unique_ptr<std::thread> thr_; + std::unique_ptr<TensorRTEngine> engine_; +}; + +/* + * Manager to control the TensorRT Int8 calibration creation and deletion. + */ +class TRTCalibratorEngineManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get an Int8Calibrator via name. + TRTCalibratorEngine* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. 
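+ // Returns the TRTCalibratorEngine registered under `engine_name`, creating
+ // and registering an empty one first if none exists yet. A hypothetical
+ // call site (assuming the usual Singleton accessor from utils/singleton.h):
+ //   auto* calib_res =
+ //       Singleton<TRTCalibratorEngineManager>::Global().LookupOrCreate(name);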
+ TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator. + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index fa2e19bc4c3b80f58e5d6c63eef4e7592b0cbfea..b0f7dcc0dffcb71a8b88c764dbced05fde36745e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -54,6 +54,7 @@ else() message(WARNING "These tests have been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") endif() + # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") @@ -115,10 +116,9 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) -# bert -set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert") -download_model_and_data(${BERT_INSTALL_DIR} "bert_model.tar.gz" "bert_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +# googlenet +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 @@ -128,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) +# bert +set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert") +download_model_and_data(${BERT_INSTALL_DIR} "bert_model.tar.gz" "bert_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index aced71b77475d351d2da59a6ad4c1c26da800fd7..24cbd39ea0fbf425a8fed66f6413008f1a97408d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -135,19 +135,6 @@ bool ParseLine(const std::string &line, return true; } -// Print outputs to log -void PrintOutputs(const std::vector &outputs) { - LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral"; - - for (size_t i = 0; i < outputs.front().data.length(); i += 3) { - LOG(INFO) << (i / 3) << "\t" - << static_cast(outputs.front().data.data())[i] << "\t" - << static_cast(outputs.front().data.data())[i + 1] - << "\t" - << static_cast(outputs.front().data.data())[i + 2]; - } -} - bool LoadInputData(std::vector<std::vector<PaddleTensor>> *inputs) { if (FLAGS_infer_data.empty()) { LOG(ERROR) << "please set input data path"; diff --git 
a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab42bcd9cc0608f033cb2777106a29583..e78ab942d113323fecf5510dca85fb5db734efc8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam model cores in CI but works locally. if (FLAGS_max_turn_num == 9) { contrib::AnalysisConfig cfg, cfg1; @@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { SetInput(&input_slots_all); // Run the first time to force an update of the memory cache. SetConfig(&cfg); - cfg.EnableMemoryOptim(true); + cfg.EnableMemoryOptim(true, true /*force update*/); CompareNativeAndAnalysis( reinterpret_cast<const PaddlePredictor::Config *>(&cfg), @@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { // Run a second time to use the memory cache and perform memory optimization. SetConfig(&cfg1); - cfg1.EnableMemoryOptim(); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); CompareNativeAndAnalysis( reinterpret_cast<const PaddlePredictor::Config *>(&cfg1), @@ -279,6 +279,24 @@ } } +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam model cores in CI but works locally. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector<std::vector<PaddleTensor>> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force an update of the memory cache. + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast<const PaddlePredictor::Config *>(&cfg), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d2ca1d0b0098fd377725cc0fdf002d8d2e3539ec..b1f7a3464ac6027faffe283bccaf9793eae939e1 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution<float> dist(low, high); + return dist(mt); +} + void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = reinterpret_cast<const contrib::AnalysisConfig *>(config); @@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, float *input_data = static_cast<float *>(input.data.data()); // fill the input with small random values for profiling. 
for (size_t j = 0; j < len; ++j) { - *(input_data + j) = static_cast(j) / len; + *(input_data + j) = Random(0.0, 1.0) / 10.; } } (*inputs).emplace_back(input_slots); @@ -344,6 +351,16 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector<std::vector<PaddleTensor>> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector<PaddleTensor> native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + template <typename T> std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 5aca807ee3aee1bb323abe6d5c3700dfc08e30b4..db7109b7505d4fe4dcfcf88f303aa262bc5b44fb 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) { inputs_all); } +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + contrib::AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + auto config = + reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 100; i++) { + std::vector<std::vector<PaddleTensor>> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt */ true); @@ -162,5 +183,15 @@ TEST(TensorRT_mobilenet, profile) { profile(model_dir, true, false); } +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, true); +} + +TEST(resnet50, compare_continuous_input_native) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9705d75e70c8d9d9cbc433dd6358596..5d8684f083bda8499000c9fd0a7617cf129db13b 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include <string> +#include <unordered_map> #include <vector> #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" @@ -37,7 +38,7 @@ template <typename Place> void *Alloc(const Place &place, size_t size); template <typename Place> -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template <typename Place> size_t Used(const Place &place); @@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map<int, std::pair<uint64_t, uint64_t>> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for the inference::RNN1 model, but that did not help 
much // for multi-thread test. @@ -98,7 +104,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -177,9 +184,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +202,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +259,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5ad639bf6176328e94da6eb71a41790c..e099425b94221bf1229e936fc1781615d13dbc26 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) @@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS 
${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d91c5a81f39594b6dab7e5e717c9818f..e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - 
selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. 
- items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidate ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated from Input(scores) in this operator.") + .AsDispensable(); AddInput("scores", - "(LoDTensor) The LodTensor containing the accumulated scores " - "corresponding to Input(ids) and its shape is the same as the " - "shape of Input(ids)."); + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). If Input(ids) is not nullptr, its " + "shape is the same as that of Input(ids). " + "If is_accumulated is true, Input(scores) is accumulated scores " + "and will be used directly. Otherwise, each score will be " + "transformed to the log domain and accumulated with " + "Input(pre_scores) first."); AddOutput("selected_ids", "A LoDTensor that stores the IDs selected by beam search."); AddOutput("selected_scores", @@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr<int>("beam_size", "beam size for beam search"); AddAttr<int>("end_id", "the token id which indicates the end of a sequence"); + AddAttr<bool>("is_accumulated", + "Whether the Input(scores) is accumulated scores.") + .SetDefault(true); AddComment(R"DOC( This operator does the search in beams for one time step. 
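A note on the semantics of the new is_accumulated attribute, sketched below with a hypothetical FinalScore helper that is not part of this patch (it only restates the AddInput("scores") description; std::log is from <cmath>):

    // is_accumulated == true: Input(scores) already holds accumulated scores
    // and is used as-is. Otherwise the per-step score is mapped to the log
    // domain and added to the prefix score from Input(pre_scores).
    float FinalScore(float pre_score, float score, bool is_accumulated) {
      return is_accumulated ? score : pre_score + std::log(score);
    }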
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: void InferShape(framework::InferShapeContext *ctx) const override { for (const std::string &arg : - std::vector({"pre_ids", "ids", "scores"})) { + std::vector<std::string>({"pre_ids", "scores"})) { PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch needs input argument '%s'", arg); } @@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel { } } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = framework::OpKernelType( - ctx.Input("pre_ids")->type(), - platform::CPUPlace()); - return kt; + auto *scores = ctx.Input<framework::LoDTensor>("scores"); + size_t level = ctx.Attr<int>("level"); + size_t batch_size = scores->lod()[level].size() - 1; + // The current CUDA kernel only supports cases with batch_size <= 4. + // Compute on CPU for cases with batch_size > 4. + if (batch_size <= 4) { + return framework::OpKernelType( + ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace()); + } else { + return framework::OpKernelType( + ctx.Input<framework::LoDTensor>("pre_ids")->type(), + platform::CPUPlace()); + } } }; diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ef9476eee5d3fac4decd7273da824b2f2349199 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>, + ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>, + ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>, + ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index b5e2ed05924cc8b7bc06058b9b1103ba10be486e..1b939e742de06aedf187d25d002d19e0a4fafc9d 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. 
- * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. - */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. 
- * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. - */ - bool NextItemSet(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - std::vector* items); - - private: - size_t beam_size_; - const framework::LoDTensor* ids_; - const framework::LoDTensor* scores_; - size_t lod_level_{0}; - size_t sent_offset_{0}; - int end_id_{0}; -}; - -std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); - -std::string ItemToString(const BeamSearch::Item& item); - template <typename DeviceContext, typename T> class BeamSearchOpKernel : public framework::OpKernel<T> { public: @@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* scores = context.Input<framework::LoDTensor>("scores"); auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids"); auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_scores); @@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel { size_t level = context.Attr<int>("level"); size_t beam_size = context.Attr<int>("beam_size"); int end_id = context.Attr<int>("end_id"); - BeamSearch alg(*ids, *scores, level, beam_size, end_id); + bool is_accumulated = context.Attr<bool>("is_accumulated"); + auto selected_ids = context.Output<framework::LoDTensor>("selected_ids"); auto selected_scores = context.Output<framework::LoDTensor>("selected_scores"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - alg(*pre_ids, *pre_scores, selected_ids, selected_scores); + + math::BeamSearchFunctor<DeviceContext, T> alg; + alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, level, beam_size, end_id, + is_accumulated); } }; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc deleted file mode 100644 index 40b46781daa989fcd89887a3c01e97e39ea71255..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. -TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af82146e7c69c7c5aab8f08d0fe0d1710..f9570e4e2ed0d9ac8739410eb7cd7397ad09fae4 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index f97ebecfdd90beade3bef824c04ad7b2763eb036..d8b997cca613f660046106512fc03bf55f9b992d 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); @@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward 
algo " << algo; } else { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - auto search_func = [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); - + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; @@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. @@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // cudnnConvolutionForward and cudnnAddTensor // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); - + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); @@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv+bias+act forward -------------------- ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 
1.0f : 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, - output_data)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 016cf8448c5e07fdedab8c5e4a7d0ae9e2ded1ee..f44094ca6b7b7f23f2e7593ad79e4e2a6f0d3070 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_grad->numel() / output_grad->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
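 // The cudnn_func/RunFunc pattern below borrows the shared cuDNN workspace
 // from the device context only for the duration of each callback, instead
 // of pinning a separate temporary allocation across the whole group loop.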
for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } @@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb492f999532fff3562050135c3a1abdcda06ad5..fc28fe818dc0bd2a8607118c015b6b5fd168fb43 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags 
glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503783b32720eb57bd303ad7eb4bc17a8..b8e63f42e2040730ac79c57651d86d9e3176fa01 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const 
std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5621f2c201b76690421a943db44535e..501a593b11d35c160348e42ee47216a85647aac4 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const 
platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7b272c35b78818aa9e26feeb5497779..fea9b09414638b607ca7f7d558ce14a2d5bfa03d 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; @@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { [=] { _GetVariable(cntl_butil, request, response, done); }); } + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + void _GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) { @@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + std::string out_varname = request->out_varname(); VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << request->trainer_id() << ", from:" << cntl->remote_side(); auto scope = request_get_h_->scope(); - auto invar = scope->FindVar(varname); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registered first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; paddle::framework::Variable* outvar = nullptr; - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - distributed::SerializeToIOBuf(varname, outvar,
*request_get_h_->dev_ctx(), - response, &cntl->response_attachment(), "", - false); + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); } } + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, @@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { private: distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; @@ -289,9 +341,10 @@ distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support processing each rpc with its own threadpool. std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 7875c16c3cf412ee06fa7c8eb36400b1096f156b..52310f8d04db6a5df9967c0a5ec9a5e95a24cdab 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); SendProcessor* s = new SendProcessor(ch); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { - VLOG(100) << "ProcGetResponse"; + VLOG(4) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, "/sendrecv.SendRecvService/GetVariable", time_out); } +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( + ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); +} + VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, "/sendrecv.SendRecvService/GetMonomerVariable",
time_out); } -VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& rpc_path, - int64_t time_out) { +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "GetRPC"; - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO( + [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; @@ -202,7 +217,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "BatchBarrierRPC"; + const std::string method = kBatchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, 
time_out); @@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendMonomerFetchBarrierRPC"; + const std::string method = kSendMonomerFetchBarrierRPC; VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); @@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendCompleteRPC"; + const std::string method = kSendCompleteRPC; VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - const std::string method = "CheckPointNotifyRPC"; + const std::string method = kCheckPointNotifyRPC; VarHandlePtr h( new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index fa77d21257647b23b8ac9f8161a216d36d7df773..ce0d2152aa27c62b6e12881aaf2ae458597e67e6 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -186,8 +186,15 @@ class GRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, @@ -228,11 +235,11 @@ class GRPCClient : public RPCClient { void Proceed(); std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, const std::string& rpc, - int64_t time_out); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279e34da0c0ac89afd3f660fa089599fe..4a9c158cb0ab7f2d6fecbba9f957ae6ef153074c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -136,17 +136,65 @@ class RequestGet final : public RequestBase { void Process() override { // proc request. 
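Aside (not part of the patch): the no-barrier Get path is driven purely by a naming convention. The client tags the requested variable name with the WITHOUT_BARRIER_MESSAGE suffix (see AsyncGetVarNoBarrier above), and the server-side handler strips that suffix again before the scope lookup (see RequestGetNoBarrierHandler further down). A minimal, self-contained sketch of that round trip; the suffix value mirrors request_handler.h, everything else is illustrative:

#include <iostream>
#include <string>

// Same value as WITHOUT_BARRIER_MESSAGE in request_handler.h.
static const std::string kWithoutBarrier = "@WITHOUT_BARRIER@RECV";

// Client side (cf. AsyncGetVarNoBarrier): tag the request name.
std::string TagNoBarrier(const std::string& varname) {
  return varname + kWithoutBarrier;
}

// Server side (cf. RequestGetNoBarrierHandler::Handle): strip the tag
// before looking the variable up in the scope.
bool StripNoBarrier(const std::string& tagged, std::string* varname) {
  if (tagged.size() < kWithoutBarrier.size()) return false;
  const size_t pos = tagged.size() - kWithoutBarrier.size();
  if (tagged.compare(pos, kWithoutBarrier.size(), kWithoutBarrier) != 0) {
    return false;
  }
  *varname = tagged.substr(0, pos);
  return true;
}

int main() {
  std::string plain;
  if (StripNoBarrier(TagNoBarrier("moment_1"), &plain)) {
    std::cout << plain << std::endl;  // prints: moment_1
  }
  return 0;
}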
std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); int trainer_id = request_.trainer_id(); - VLOG(4) << "RequestGet " << varname; + + VLOG(4) << "RequestGet " << out_varname << " from " << varname; auto scope = request_handler_->scope(); - auto invar = scope->FindVar(varname); + framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetNoBarrier final : public RequestBase { + public: + explicit RequestGetNoBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetNoBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } Finish(reply_, &responder_); @@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGetMonomerVariable) { b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, this); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 0b5c5151e637f0d7aeafaefefb01006ffe0f05c8..2965fe4490bedd0253682f0aef44e096232fc2fc 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -81,6 +81,7 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetVariableNoBarrier, kGetMonomerVariable, kGetMonomerBarrier, }; @@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; case GrpcMethod::kGetMonomerVariable: return "/sendrecv.SendRecvService/GetMonomerVariable"; case 
GrpcMethod::kGetMonomerBarrier: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b41efead24c8bdbe08c9b44e160445a..991158ac72007efc1233f852caed4f90f35fe1cd 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e91d2dfbe929dcc11645a40c44afb4e..a1c5c0777402b808eed6306862fd6dd41b529dbd 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -53,6 +54,11 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -81,7 +87,8 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(4) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -112,6 +119,32 @@ bool RequestGetHandler::Handle(const std::string& varname, return true; } +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + 
string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 5e0b25c5c2ce161dee0948a07baab32dfff9be6f..f3c1b24526b8b28033c0c979f74d44a3d7a94201 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler { bool enable_dc_asgd_; }; +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + static inline void BuildVar(const std::string& param_name, std::initializer_list arguments, paddle::framework::proto::OpDesc::Var* var) { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index b668d869787a47ebd36f570061421ddbeae5a09a..ea54e0c2951253fc009672f4cd2e5233ed56944e 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -43,8 +43,15 @@ class RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a12ec5386041dfeea22fd388d94115e6..c3a46e348c69a20953f013c7de772a37db5f4844 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,27 +39,33 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << 
barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } @@ -71,7 +77,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { @@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, @@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); }); - VLOG(4) << "WaitVarCond 
var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index b39eef04d8d1de77cb951f90a10e69eebb495282..6303667884361be050ac62c604274c87caa72444 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -17,8 +17,14 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -27,12 +33,17 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } +// It can be: LoDTensor、SelectedRows or NCCL_ID enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } +// VariableMessage is serialized paddle variable message. +// NOTICE(gongwb):don't modify this proto if you are not +// not familar with how we serialize in sendrecvop_utils.h +// and deserilize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -49,14 +60,21 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; + // bool persistable is not needed for sending. + // tensor info: Type data_type = 3; repeated int64 dims = 4; + // lod details: int64 lod_level = 5; repeated LodData lod = 6; + // selected_rows height, aka. original dim0 int64 slr_height = 7; + // tensor data bytes serialized = 8; + // selected_rows data bytes rows = 9; + // Look up table block execution output variable name. 
string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 47ff568a1135f2f0a146faa4d5d6fc422a344f51..7825b4fc82b1f7580fea8ab4961facaf7fd64397 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() - << ", Buffer Size = " << length; - PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast(length)); + << ", Buffer Size = " << length << ", dims:" << dims + << ", numel:" << tensor->numel(); + PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast(length)); return ReadRaw(input, ctx, tensor->place(), tensor_data, length); } diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 629f364d712694d2bc1d2483711f5247561ed1da..5b30ed472d51a37a0705d1717395da9e4ff7d743 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. + VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } @@ -347,6 +353,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, new distributed::RequestPrefetchHandler(sync_mode)); request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( sync_mode, checkpoint_block_id)); + request_get_no_barrier_handler_.reset( + new distributed::RequestGetNoBarrierHandler()); rpc_service_->RegisterRPC(distributed::kRequestSend, request_send_handler_.get(), @@ -359,6 +367,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, FLAGS_rpc_prefetch_thread_num); rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, request_checkpoint_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier, + request_get_no_barrier_handler_.get()); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -413,6 +423,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, f(request_get_handler_.get()); f(request_prefetch_handler_.get()); f(request_checkpoint_handler_.get()); + f(request_get_no_barrier_handler_.get()); // start the 
server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index 9431978df836121baacc12ed4e1ee6b218cc7d7a..f20442bad7c5bd96173b9d6efc4dceb13feacf5b 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - virtual ~ListenAndServOp(); void RunSyncLoop(framework::Executor* executor, @@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase { mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_get_no_barrier_handler_; mutable std::shared_ptr request_prefetch_handler_; mutable std::shared_ptr diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c57590191d58a12760fb335df76037685d1ced..05c00251b97bb5071102a43208c1fbbfa4ef8d2d 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 48065437e38b2c5457c135cce03075f81110a329..120c65f29699bf2745b09ea312d1de069c8173c5 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -27,30 +27,50 @@ namespace operators { class RecvOp : public framework::OperatorBase { public: - RecvOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + 
const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto outs = Outputs("Out"); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { std::vector epmap = Attr>("epmap"); + std::vector varnames = + Attr>("varnames"); int sync_mode = Attr("sync_mode"); + auto outs = Outputs("Out"); + bool with_barrier = Attr("with_barrier"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); - distributed::RPCClient* rpc_client = + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( Attr("trainer_id")); - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); - } - if (sync_mode) { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } @@ -79,12 +99,23 @@ This operator can get variables from server side. "(int, default 0)" "sync recv or async recv.") .SetDefault(0); + AddAttr("with_barrier", + "(bool, default True) if with_barrier=False, will use " + "AsyncGetVarNoBarrier to get the variable from the pserver immediately") + .SetDefault(true); + AddAttr>( + "varnames", + "(string vector, default {}) " + "sometimes we need to store the received var under another name; " + "for example: we need a var named 'moment_1@127.0.0.1:1001', " + "and its real name on the parameter server is 'moment_1'. 
") + .SetDefault({}); } }; class RecvOpShapeInference : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* ctx) const override {} + void operator()(framework::InferShapeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bb6934e1496cc989eee8ba82f56959522803bfb..cb8a4e7e1502e7e6ceb48e51452c2c7ab8313972 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -277,68 +277,6 @@ class TransformFunctor { Functor func_; }; -#define EIGEN_FUNCTOR(name, eigen_op) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_e); \ - } \ - template \ - inline void RunBroadCast(const framework::Tensor *x, \ - const framework::Tensor *y, framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ - .broadcast(Eigen::DSizes(pre, 1)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - template \ - inline void RunBroadCast2(const framework::Tensor *x, \ - const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n, int post) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ - .broadcast(Eigen::DSizes(pre, 1, post)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - } - -#define EIGEN_ADD(x, y) ((x) + (y)) - -EIGEN_FUNCTOR(Add, EIGEN_ADD); - -#define EIGEN_SUB(x, y) ((x) - (y)) - -EIGEN_FUNCTOR(Sub, EIGEN_SUB); - -#define EIGEN_MUL(x, y) ((x) * (y)) - -EIGEN_FUNCTOR(Mul, EIGEN_MUL); - -#define EIGEN_DIV(x, y) ((x) / (y)) - -EIGEN_FUNCTOR(Div, EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index c72a966c575d4a63471905b82643e96454f08187..6e13887866485bd114ebf12f4bdfa8d60fca6d01 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_datas.push_back( static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - for (int i = 0; i < 4; ++i) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha, 
in_desc[i], in_datas[i], filter_desc[i], - static_cast(filters[i]->data()), conv_desc[i], - algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], - out_datas[i], bias_desc[i], - static_cast(bias[i]->data()), cudnn_act_desc, - out_desc[i], out_datas[i])); + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); } cudnnTensorDescriptor_t x_desc; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8f4a9f7685c84f1d9767f5f7eedf0e7..241184c6f4a19a1da0d6d75c5d4e2b372c14e9da 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], "Input(X) and Input(Grid) dims[0] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + grid_dims[1], x_dims[2], + "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); + PADDLE_ENFORCE_EQ( + grid_dims[2], x_dims[3], + "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + } ctx->SetOutputDim("Output", x_dims); ctx->ShareLoD("X", "Output"); diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 262094f9224407bb412f5b189a748efe13cb04b2..35775d7ec9efcdbad69e4491792f7d4e513832ad 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 74d6a87247821eb1d17cc97b8d8b4bcf1c832f79..186c37c56ec9410ac9a31503e33e7e334d0afc40 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -18,6 +18,7 @@ #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" @@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { LOG(INFO) << loginfos.str(); } +using Tensor = paddle::framework::Tensor; + template void BenchXYZNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d), z(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), - z.data(), d); + Tensor x, y, z; + x.Resize({d}); + y.Resize({d}); + z.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + T* z_data = z.mutable_data(PlaceType()); + RandomVec(d, x_data); + RandomVec(d, y_data); + BenchAllImpls, PlaceType>(d, x.data(), + y.data(), z_data, d); } } @@ -170,9 +179,13 @@ template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); } } @@ -180,9 +193,13 @@ void BenchAXYNKernel() { template void BenchXYNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, x.data(), y_data, d); } } @@ -192,16 +209,23 @@ void BenchLSTMKernel() { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); - RandomVec(4 * d, x.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); + Tensor x, ct_1, ct, ht, wp, checked; + x.Resize({4 * d}); + ct_1.Resize({d}); + ct.Resize({d}); + ht.Resize({d}); + wp.Resize({3 * d}); + checked.Resize({2 * d}); + auto place = PlaceType(); + RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); + RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); + RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.mutable_data(place); + T* checked_data = checked.mutable_data(place); + T* ct_data = ct.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; @@ -220,12 +244,16 @@ template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - std::vector x(3 * d), ht_1(d), ht(d); - RandomVec(3 * d, x.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); + auto place = PlaceType(); + Tensor x, ht_1, ht; + x.Resize({3 * d}); + ht_1.Resize({d}); + ht.Resize({d}); + RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); + RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; @@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; 
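Aside (not part of the patch): every Bench*Kernel in this file is migrated with the same recipe — replace a host-side std::vector buffer with a framework::Tensor, Resize it, then take a typed pointer via mutable_data(place) so the buffer is allocated through the framework allocator for the benchmarked place. A condensed sketch of that recipe; AllocBenchBuffer is a hypothetical helper, not in the patch:

#include "paddle/fluid/framework/tensor.h"

// Hypothetical helper showing the allocation pattern used in each benchmark.
template <typename T, typename PlaceType>
T* AllocBenchBuffer(paddle::framework::Tensor* t, int64_t numel) {
  t->Resize({numel});                      // set the shape first
  return t->mutable_data<T>(PlaceType());  // then allocate on that place
}

Read-only inputs are afterwards accessed through data<T>() and outputs through the pointer returned by mutable_data, as in the hunks above and below.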
- std::vector x(h * w), y(w); - RandomVec(h * w, x.data(), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.data(); + Tensor x, y; + x.Resize({h * w}); + y.Resize({w}); + RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(attr, x_data, y_data, &attr); } @@ -259,12 +289,15 @@ void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); + Tensor a, b, c; + a.Resize({m * k}); + b.Resize({k * n}); + c.Resize({m * n}); + RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); + const T* a_data = a.data(); + const T* b_data = b.data(); + T* c_data = c.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(k, a_data, b_data, c_data, m, n, k); } diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 4e4f977fcc742856b877ef0b7f9a3cc9879aefce..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + // MKL-DNN implements LRN in a Caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // There, the sum of squares is divided by the size of the normalization + // window; this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this difference by + // multiplying alpha by the size of the window (n). + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); const bool is_test = ctx.Attr("is_test"); @@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +128,13 @@
class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
       auto workspace_memory =
           mkldnn::memory{forward_pd.workspace_primitive_desc()};
+      auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
+                                       static_cast<void*>(output_data));

       run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
+
+      out->set_layout(framework::DataLayout::kMKLDNN);
+      out->set_format(platform::GetMKLDNNFormat(dst_memory));
     }
   }
 };
@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key_workspace_memory = key + "@lrn_workspace_memory";

     const int n = ctx.Attr<int>("n");
-    const float alpha = ctx.Attr<float>("alpha");
+    const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
     const float beta = ctx.Attr<float>("beta");
     const float k = ctx.Attr<float>("k");

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index dc27e543f0dfd65e556f9e3a138778972ad6982f..6bbb7155dda9b2c844f793a63adb861c2ed956e8 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -54,6 +54,7 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+math_library(beam_search DEPS math_function)

 math_library(matrix_bit_code)

@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
 cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
+cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
 if(WITH_GPU)
   nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
   nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7119273a734feba870fdabade6a4faa1d5e9a3
--- /dev/null
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/math/beam_search.h"
+#include <algorithm>
+#include <map>
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class BeamSearchFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext &context,
+                  const framework::LoDTensor *pre_ids,
+                  const framework::LoDTensor *pre_scores,
+                  const framework::LoDTensor *ids,
+                  const framework::LoDTensor *scores,
+                  framework::LoDTensor *selected_ids,
+                  framework::LoDTensor *selected_scores, size_t level,
+                  size_t beam_size, int end_id, bool is_accumulated) {
+    auto abs_lod = framework::ToAbsOffset(scores->lod());
+    auto &high_level = abs_lod[level];
+
+    auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores,
+                                        level, beam_size, end_id,
+                                        is_accumulated);
+    auto selected_items = ToMap(items, high_level.back());
+    if (FLAGS_v == 3) {
+      VLOG(3) << "selected_items:";
+      for (size_t i = 0; i < selected_items.size(); ++i) {
+        VLOG(3) << "offset: " << i;
+        for (auto &item : selected_items[i]) {
+          VLOG(3) << item.ToString();
+        }
+      }
+    }
+
+    PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
+    // calculate the output tensor's height
+    size_t num_instances = std::accumulate(
+        std::begin(selected_items), std::end(selected_items), 0,
+        [](size_t a, std::vector<Item> &b) { return a + b.size(); });
+    // the output tensor shape should be [num_instances, 1]
+    auto dims = framework::make_ddim(
+        std::vector<int64_t>({static_cast<int64_t>(num_instances), 1}));
+    selected_ids->Resize(dims);
+    selected_scores->Resize(dims);
+
+    auto *selected_ids_data =
+        selected_ids->mutable_data<int64_t>(platform::CPUPlace());
+    auto *selected_scores_data =
+        selected_scores->mutable_data<float>(platform::CPUPlace());
+
+    // fill in data
+    std::vector<size_t> low_level;
+    size_t low_offset = 0;
+    for (auto &items : selected_items) {
+      low_level.push_back(low_offset);
+      for (auto &item : items) {
+        selected_ids_data[low_offset] = item.id;
+        selected_scores_data[low_offset] = item.score;
+        low_offset++;
+      }
+    }
+    low_level.push_back(low_offset);
+
+    // fill lod
+    framework::LoD lod(2);
+    lod[0].assign(high_level.begin(), high_level.end());
+    lod[1].assign(low_level.begin(), low_level.end());
+    if (!framework::CheckLoD(lod)) {
+      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+    }
+    selected_ids->set_lod(lod);
+    selected_scores->set_lod(lod);
+  }
+
+  /*
+   * The basic items used for sorting.
+   */
+  struct Item {
+    Item() {}
+    Item(size_t offset, size_t id, float score)
+        : offset(offset), id(id), score(score) {}
+    // offset in the higher lod level.
+    size_t offset;
+    // prefix id in the lower lod level.
+    // size_t prefix;
+    // the candidate id
+    size_t id;
+    // the corresponding score
+    float score;
+
+    inline bool operator<(const Item &in) const {
+      return (score < in.score) ||
+             ((score == in.score) && (offset < in.offset));
+    }
+
+    inline void operator=(const Item &in) {
+      offset = in.offset;
+      id = in.id;
+      score = in.score;
+    }
+
+    std::string ToString() {
+      std::ostringstream os;
+      os << "{";
+      os << "offset: " << offset << ", ";
+      os << "id: " << id << ", ";
+      os << "score: " << score << "";
+      os << "}";
+      return os.str();
+    }
+  };
+
+ protected:
+  /*
+   * Prune the source sentences whose branches have all finished; this step
+   * is optional. Pruning must happen one step later than finishing (thus
+   * pre_ids is needed here), since the end tokens must be written out.
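A side note on the Item ordering defined above: operator< compares by score and breaks ties by offset, so a descending sort puts the best-scoring candidate first, which is the invariant the top_beam array maintains. A minimal standalone sketch with hypothetical values (not part of the patch):

    #include <algorithm>
    #include <cassert>
    #include <vector>
    struct Item {
      size_t offset, id;
      float score;
      bool operator<(const Item& in) const {
        return (score < in.score) ||
               ((score == in.score) && (offset < in.offset));
      }
    };
    int main() {
      std::vector<Item> items = {{0, 7, 0.3f}, {1, 2, 0.9f}, {0, 5, 0.9f}};
      // Sort descending, as the top_beam array is kept: best candidate first.
      std::sort(items.begin(), items.end(),
                [](const Item& a, const Item& b) { return b < a; });
      assert(items.front().score == 0.9f);
      assert(items.front().offset == 1);  // on equal scores, larger offset wins
      return 0;
    }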
+   */
+  void PruneEndBeams(const framework::LoDTensor *pre_ids,
+                     const framework::LoD &abs_lod,
+                     std::vector<std::vector<Item>> *items, size_t lod_level,
+                     int end_id) {
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto &high_level = abs_lod[lod_level];
+    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
+      size_t src_prefix_start = high_level[src_idx];
+      size_t src_prefix_end = high_level[src_idx + 1];
+      bool finish_flag = true;
+      for (size_t offset = src_prefix_start; offset < src_prefix_end;
+           offset++) {
+        for (auto &item : items->at(offset)) {
+          if (item.id != static_cast<size_t>(end_id) ||
+              pre_ids_data[offset] != end_id) {
+            finish_flag = false;
+            break;
+          }
+        }
+        if (!finish_flag) break;
+      }
+      if (finish_flag) {  // all branches of the beam (source sentence) end;
+                          // prune this beam
+        for (size_t offset = src_prefix_start; offset < src_prefix_end;
+             offset++)
+          items->at(offset).clear();
+      }
+    }
+  }
+
+  /*
+   * Transform the items into a map whose key is offset, value is the items.
+   * NOTE: low performance.
+   */
+  std::vector<std::vector<Item>> ToMap(
+      const std::vector<std::vector<Item>> &items, size_t element_num) {
+    std::vector<std::vector<Item>> result;
+    result.resize(element_num);
+    for (auto &entries : items) {
+      for (const auto &item : entries) {
+        result[item.offset].push_back(item);
+      }
+    }
+    return result;
+  }
+
+  void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
+              size_t beam_size) {
+    std::vector<Item> &top_beam = *top_beam_ptr;
+
+    size_t num_beams = top_beam.size();
+    if (num_beams < beam_size) {
+      top_beam.resize(num_beams + 1);
+      num_beams++;
+    } else {
+      if (item < top_beam[beam_size - 1]) {
+        return;
+      }
+    }
+
+    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
+      if (top_beam[k] < item) {
+        top_beam[k + 1] = top_beam[k];
+      } else {
+        top_beam[k + 1] = item;
+        return;
+      }
+    }
+    top_beam[0] = item;
+  }
+
+  /*
+   * For each source, select the top beam_size records.
+   */
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
+      const framework::LoDTensor *pre_ids,
+      const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
+      const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
+      int end_id, bool is_accumulated) {
+    std::vector<std::vector<Item>> result;
+
+    // find the current candidates
+    auto abs_lod = framework::ToAbsOffset(scores->lod());
+
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto *pre_scores_data = pre_scores->data<float>();
+
+    auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
+    auto *scores_data = scores->data<float>();
+
+    size_t num_seqs = scores->NumElements(lod_level);
+    size_t seq_width = 1;
+    for (int i = 1; i < scores->dims().size(); i++) {
+      seq_width *= scores->dims()[i];
+    }
+
+    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
+      size_t seq_offset_start = abs_lod[lod_level][seq_id];
+      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
+
+      std::vector<Item> top_beam;
+      top_beam.reserve(beam_size);
+
+      for (size_t offset = seq_offset_start; offset < seq_offset_end;
+           ++offset) {
+        auto pre_id = pre_ids_data[offset];
+        auto pre_score = pre_scores_data[offset];
+        if (pre_id == end_id) {
+          // Allocate all probability mass to end_id for finished branches;
+          // the other candidate ids can be ignored.
+          Item item(offset, end_id, pre_score);
+          Insert(&top_beam, item, beam_size);
+        } else {
+          size_t index = offset * seq_width;
+          for (size_t d = 0; d < seq_width; d++, index++) {
+            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
+            float score = is_accumulated
+                              ?
scores_data[index] + : pre_score + std::log(scores_data[index]); + Item item(offset, id, score); + Insert(&top_beam, item, beam_size); + } + } + } + + result.emplace_back(top_beam); + } + + if (FLAGS_v == 3) { + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << item.ToString(); + } + } + } + + return result; + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu new file mode 100644 index 0000000000000000000000000000000000000000..d94e3023ce537cb9fa456e079c4fa3cf57fb954d --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cu @@ -0,0 +1,393 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { +namespace math { + +struct Triple { + __device__ __forceinline__ Triple() {} + __device__ __forceinline__ Triple(int o, int i, float s) + : offset(o), id(i), score(s) {} + + __device__ __forceinline__ void set(int o, int i, float s) { + offset = o; + id = i; + score = s; + } + + __device__ __forceinline__ void operator=(const Triple& in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + __device__ __forceinline__ bool operator<(const float s) const { + return score < s; + } + + __device__ __forceinline__ bool operator<(const Triple& in) const { + return (score < in.score) || ((score == in.score) && (offset < in.offset)); + } + + int offset; + int id; + float score; +}; + +__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p, + int beam_size) { + if (p < top_beam[beam_size - 1]) { + return; + } + for (int k = beam_size - 2; k >= 0; --k) { + if (top_beam[k] < p) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = p; + return; + } + } + top_beam[0] = p; +} + +template +__device__ __forceinline__ int SelectTopBeam( + Triple* top_beam, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + int used_threads) { + // top_beam is shared memory + const int tid = threadIdx.x; + const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq; + + int num_used_threads = used_threads; + + Triple* top_beam_local = top_beam + tid * beam_size; + if (tid_of_seq < num_used_threads) { + for (int i = 0; i < beam_size; ++i) { + top_beam_local[i].set(-1, -1, -INFINITY); + } + + for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) { + int pre_id = static_cast(pre_ids[offset]); + if (pre_id == end_id) { + if (tid_of_seq == 0) { + Triple tmp(offset, 
end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + Triple* top_beam_local, const int seq_offset_start, + const int seq_offset_end, const int selected_seq_start, + const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_offset_start, const int seq_offset_end, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each 
sequence. + Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, + seq_offset_start, seq_offset_end, selected_seq_start, + selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_length, const int seq_width, + int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? 
ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. + auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000000000000000000000000000000000..3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * This is an implementation of beam search.
+ *
+ * To explain the details, let's take the machine translation task as an
+ * example. In this task, one source sentence is translated into multiple
+ * target sentences; along the way, each sentence expands into multiple
+ * translation prefixes (target sentences that have not yet ended). At each
+ * time step a prefix has some candidates; given the candidate ids and their
+ * corresponding scores (probabilities), beam search sorts and selects the
+ * top beam_size candidates for each source sentence, and stores the selected
+ * candidates' scores and corresponding ids in LoDTensors.
+ *
+ * A detailed example:
+ *
+ * Input
+ *
+ * ids:
+ * - LoD (should have 2 levels)
+ *   - first level: [0, 1, 4]
+ *   - second level: [0, 1, 2, 3, 4]
+ * - tensor's data:
+ *     [[4, 2, 5]
+ *      [2, 1, 3]
+ *      [3, 5, 2]
+ *      [8, 2, 1]]
+ *
+ * scores:
+ * - LoD same as `ids`
+ * - tensor's data
+ *     [[0.5, 0.3, 0.2]
+ *      [0.6, 0.3, 0.1]
+ *      [0.9, 0.5, 0.1]
+ *      [0.7, 0.5, 0.1]]
+ *
+ * The inputs mean that there are 2 source sentences to translate; the first
+ * source has 1 prefix and the second source has 3 prefixes.
+ *
+ * Let's assume the beam size is 2. The beam search output should then be
+ * - LoD
+ *   - first level: [0, 1, 2]
+ *   - second level: [0, 2, 4]
+ * - id tensor's data
+ *     [[4,
+ *       1,
+ *       3,
+ *       8]]
+ * - score tensor's data
+ *     [[0.5,
+ *       0.3,
+ *       0.9,
+ *       0.7]]
+ *
+ * TODO: all the prune operations should be in the beam search, so it is
+ * better to split the beam search algorithm into a sequence of smaller
+ * operators, and the prune operators can be inserted into this sequence.
+ */
+template <typename DeviceContext, typename T>
+class BeamSearchFunctor {
+ public:
+  /*
+   * The main function of beam search.
+   *
+   * @selected_ids: a [None, 1]-shaped tensor with LoD.
+   *   In a machine translation model, it might be the candidate term id sets,
+   *   each set stored as a variable-length sequence.
+   *   The format might be described with a two-level LoD
+   *   - [[0 1],
+   *      [0 1 2]]
+   *   - [[]
+   *      [0 1]]
+   *   the first level of LoD tells that there are two source sentences. The
+   *   second level describes the details of the candidate id sets' offsets
+   *   in the source sentences.
+   *
+   * @selected_scores: a LoD tensor with the same shape and LoD as
+   *   selected_ids.
+   *   It stores the corresponding scores of candidate ids in selected_ids.
+   *
+   * Return false if all the input tensors are empty; in the machine
+   * translation task that means no candidates are provided, and the task
+   * will stop running.
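To make the calling convention concrete, here is a minimal invocation sketch that matches the worked example above. It is a sketch only: it assumes a platform::CPUDeviceContext named cpu_ctx and LoDTensors pre_ids, pre_scores, ids, and scores already filled as described (the unit test later in this patch prepares inputs the same way):

    paddle::operators::math::BeamSearchFunctor<
        paddle::platform::CPUDeviceContext, float> beam_search;
    beam_search(cpu_ctx, &pre_ids, &pre_scores, &ids, &scores,
                &selected_ids, &selected_scores,
                /*level=*/0, /*beam_size=*/2, /*end_id=*/0,
                /*is_accumulated=*/true);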
+ */ + void operator()(const DeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c29ee95f6b109209316e4e8c8f3cda37eac62ae --- /dev/null +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + TensorCopySync(cpu_pre_ids, 
*place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8f1d2cab19c74b57fdf9f903d9dc65d..238d9f2905058d267ffbee0669594920d7a9e031 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a504e3bd6b37c3cc772c179eab6038a4..3fa5a7ae336a9be984324411b88570aea99c2c78 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3f0ae9c7612c4f74470472393ff4ad6..aedb82da2f0fb2f15e1586d351af7c9d4364852b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e43f14445c969648cd469b0e32d644c7..74892316e6decdeab3a08396fa2f4bdeb8eb7b73 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e798912ff80eeb5d753914c7d8d70a05f..cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096eb3d23273e362e658cb1b5fc808609..3e48b67a570d41482e358ae3941eb1e2b6ab91f8 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + 
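        // Each sampled label must be non-negative; a negative id here would
        // mean CustomSampler::Sample returned an invalid alias (see the
        // exceptional_val handling in sampler.cc above).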
PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b256ef02666c21ec1db3f6922b56bb23363b4a0 --- /dev/null +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_NGRAPH) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) + op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) +endif() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc similarity index 55% rename from paddle/fluid/framework/ngraph_bridge.cc rename to paddle/fluid/operators/ngraph/ngraph_bridge.cc index 365870c54eb3861ad6c273d3866dcd32d1c4166a..d6e897ed4666261cdd0bd6565f61abb218d971e5 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -17,39 +17,39 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { -namespace framework { +namespace operators { namespace NG_OPS = paddle::operators::ngraphs; std::map&, + std::function&, std::shared_ptr>>)>> NgraphBridge::NG_NODE_MAP = { {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, - {"mean", paddle::operators::ngraphs::BuildMeanNode}, - {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, - {"mul", paddle::operators::ngraphs::BuildMulNode}, - {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, - {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, - {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, - {"scale", paddle::operators::ngraphs::BuildScaleNode}, - {"relu", paddle::operators::ngraphs::BuildUnaryNode}, - {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, - {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; - -void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { + {"fill_constant", NG_OPS::BuildFillConstantNode}, + {"mean", NG_OPS::BuildMeanNode}, + {"mean_grad", NG_OPS::BuildMeanGradNode}, + {"mul", NG_OPS::BuildMulNode}, + {"mul_grad", NG_OPS::BuildMulGradNode}, + {"softmax", NG_OPS::BuildSoftmaxNode}, + {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, + {"scale", NG_OPS::BuildScaleNode}, + {"relu", NG_OPS::BuildUnaryNode}, + {"tanh", NG_OPS::BuildUnaryNode}, + {"top_k", NG_OPS::BuildTopKNode}}; + +void NgraphBridge::BuildNgNode( + const std::shared_ptr& op) { auto& op_type = op->Type(); NG_NODE_MAP[op_type](op, ngb_node_map_); } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h similarity index 84% rename from paddle/fluid/framework/ngraph_bridge.h rename to paddle/fluid/operators/ngraph/ngraph_bridge.h index 
5ad7b8daeb6a782515e50fc87ca7188b46308390..c57988f8f6322e76678c572aa21ff5b17b9e3c22 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -21,16 +21,16 @@ limitations under the License. */ #include "ngraph/node.hpp" -namespace paddle { -namespace framework { +#include "paddle/fluid/framework/operator.h" -class OperatorBase; +namespace paddle { +namespace operators { class NgraphBridge { public: static std::map< std::string, - std::function&, + std::function&, std::shared_ptr>>)>> NG_NODE_MAP; @@ -41,7 +41,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgNode(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< @@ -49,5 +49,5 @@ class NgraphBridge { ngb_node_map_; }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..bec4b514a218715134d2366dd7efd7cf5b377b68 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,491 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 
1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (NgraphBridge::NG_NODE_MAP.find(op_type) == + NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != + NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == 
framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphEngine::BuildNgFunction() 
{ + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +void NgraphEngine::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string input_shape_str; + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + for (size_t i = 0; i < shape.size(); ++i) { + input_shape_str += std::to_string(shape.at(i)); + } + } + func_cache_key_ = input_shape_str + func_cache_key_; + if (func_cache_.find(func_cache_key_) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(func_cache_key_); + } else { + BuildNgFunction(); + func_cache_[func_cache_key_] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphEngine::Run(const framework::Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + auto ng_type = var_type_map_.at(vi); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || + ng_op_state_ == OpState::FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto vo = var_out_[i]; + auto* var = scope.FindVar(vo); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(vo); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", vo); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vo); + } + } + + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); +} // NgraphEngine::Run +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..bf5ff2a743b0edb69163e674d36c56a02c0b4153 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace operators { + +enum class OpState { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST, /* Support partial list of ops for test */ + FULL, /* All ops supported from feed to fetch */ + UNKNOWN /* Output all for debug purpose */ +}; + +// perform graph build through bridge and execute computation +class NgraphEngine { + public: + explicit NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval); + + void Run(const framework::Scope& scope, const platform::Place& place) const; + + static void EnableNgraph(const framework::ProgramDesc& program); + + private: + static std::unordered_map> + func_cache_; + const framework::Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + OpState ng_op_state_ = OpState::UNKNOWN; + std::string func_cache_key_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // prepare info for nraph engine + void Prepare(const framework::BlockDesc& block, + const std::vector& interval); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3051ca123b29658d3e9a35239ad00f621a297cb5 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDispensable(); + AddOutput("Ys", "A list of outputs").AsDispensable(); + AddAttr("graph", "the graph."); + AddAttr>("interval", "op interval supported by ngraph"); + AddComment("ngraph engine operator."); + } +}; + +class NgraphEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker, + ops::NgraphEngineOpMaker); +REGISTER_OP_CPU_KERNEL( + ngraph_engine, + ops::NgraphEngineKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d2974298b0707575624ad2f6935e83d06b4c83bb --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr<std::string>("graph"); + auto interval = ctx.Attr<std::vector<int>>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 58a465d87a8c0da50e3eb80fefe32d50217f6990..2a3e80c9152b5550631f8c5669283b782f975d4e 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - int thread_num = Attr("thread_num"); - std::vector slots = Attr>("slots"); - int batch_size = Attr("batch_size"); - std::vector file_list = - Attr>("file_list"); - out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, - thread_num, slots, file_list)); + auto thread_num = Attr<int>("thread_num"); + auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots"); + auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index"); + auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index"); + auto batch_size = Attr<int>("batch_size"); + auto file_type = Attr<std::string>("file_type"); + auto file_format = Attr<std::string>("file_format"); + auto file_list = Attr<std::vector<std::string>>("file_list"); + DataDesc data_desc(batch_size, file_list, file_type, file_format, + dense_slot_index, sparse_slot_index, sparse_slots); + VLOG(1) << data_desc; + out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num, + data_desc)); } }; @@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddAttr("thread_num", "the thread num to read data"); AddAttr("batch_size", "the batch size of read data"); + AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain"); + AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv"); AddAttr>("file_list", "The list of files that need to read"); - AddAttr>( - "slots", "the slots that should be extract from file"); + AddAttr<std::vector<int>>( + "dense_slot_index", + "the dense slot ids that should be extracted from the file") + .SetDefault({}); + AddAttr<std::vector<int>>( + "sparse_slot_index", + "the sparse slot ids that should be extracted from the file") + .SetDefault({}); + AddAttr<std::vector<std::string>>("sparse_slots", + "the sparse slot ids that should be " + "extracted from the file, used when " + "the file format is svm"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp.
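For illustration only, a sketch that is not part of the patch: the attributes above feed straight into the DataDesc introduced in ctr_reader.h further below. With invented file names and slot indices, the mapping looks like this:
// Hypothetical usage: plain-text csv input, batch_size 3, where column 1
// holds a dense slot and column 2 a sparse slot (no svm slot ids needed).
std::vector<std::string> files = {"part-0.csv", "part-1.csv"};
DataDesc data_desc(/*batch_size=*/3, files, /*file_type=*/"plain",
                   /*file_format=*/"csv", /*dense_slot_index=*/{1},
                   /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});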
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89dc09a185e6a41274cf382b430ec3eeb..f08798794a2f9fc042800583cbc032d6f12bf3dc 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,6 +73,9 @@ static inline void parse_line( } } +// label slot1:fea_sign slot2:fea_sign slot1:fea_sign +static inline void parse_svm_line(const std::string& line) {} + class Reader { public: virtual ~Reader() {} @@ -95,11 +98,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : stream_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return stream_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(stream_, *line); } + + private: + std::ifstream stream_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); } } @@ -119,46 +138,35 @@ class MultiGzipReader : public Reader { } private: - std::vector> readers_; + std::vector> readers_; size_t current_reader_index_ = 0; }; void MonitorThread(std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "monitor thread in"; + VLOG(3) << "monitor thread in"; bool reader_thread_is_running = true; while (reader_thread_is_running) { - VLOG(30) << "reader_thread_is_running"; + VLOG(3) << "reader_thread_is_running"; reader_thread_is_running = false; for (size_t i = 0; i < (*thread_status).size(); ++i) { if ((*thread_status)[i] == Running) { - VLOG(30) << "reader is running!"; + VLOG(3) << "reader is running!"; reader_thread_is_running = true; } } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(30) << "all reader thread is stopped, push empty data into queue"; - queue->Push({}); - VLOG(30) << "monitor thread exited"; + VLOG(3) << "all reader thread is stopped, close the queue"; + queue->Close(); + VLOG(3) << "monitor thread exited"; } -void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, - std::shared_ptr queue) { - VLOG(30) << "[" << thread_id << "]" - << " reader thread start! 
thread_id = " << thread_id; - for (auto& file : file_list) { - VLOG(30) << "[" << thread_id << "]" - << " file " << file; - } - (*thread_status)[thread_id] = Running; - VLOG(30) << "set status to running"; - +void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { std::unordered_map slot_to_index; - for (size_t i = 0; i < slots.size(); ++i) { - slot_to_index[slots[i]] = i; + for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { + slot_to_index[data_desc.sparse_slot_ids_[i]] = i; } std::string line; @@ -166,21 +174,17 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiGzipReader reader(file_list); - - VLOG(30) << "reader inited"; - - while (reader.HasNext()) { + while (reader->HasNext()) { batch_data.clear(); - batch_data.reserve(batch_size); + batch_data.reserve(data_desc.batch_size_); batch_label.clear(); - batch_label.reserve(batch_size); + batch_label.reserve(data_desc.batch_size_); // read batch_size data - for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); std::unordered_map> slot_to_data; int64_t label; parse_line(line, slot_to_index, &label, &slot_to_data); @@ -193,8 +197,8 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; - // first insert tensor for each slots - for (auto& slot : slots) { + // first insert tensor for each sparse_slots + for (auto& slot : data_desc.sparse_slot_ids_) { std::vector lod_data{0}; std::vector batch_feasign; @@ -226,11 +230,167 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(label_tensor); queue->Push(lod_datas); - VLOG(40) << "push one data, queue_size=" << queue->Size(); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + auto id = std::stol(data_str); + (*sparse_datas)[i].push_back(id); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + 
if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +void ReadThread(const std::vector& file_list, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! 
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 56879ffda5d3e04a88d12d6c4701c24a0d0ee4f7..740cd5219c70331d1f71d832adef084c148a2408 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,9 +36,63 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; +struct DataDesc { + DataDesc(int batch_size, const std::vector& file_names, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slot_index, + const std::vector& sparse_slot_index, + const std::vector& sparse_slot_ids) + : batch_size_(batch_size), + file_names_(file_names), + file_type_(file_type), + file_format_(file_format), + dense_slot_index_(dense_slot_index), + sparse_slot_index_(sparse_slot_index), + sparse_slot_ids_(sparse_slot_ids) {} + + const int batch_size_; + const std::vector file_names_; + const std::string file_type_; // gzip or plain + const std::string file_format_; // csv or svm + // used for csv data format + const std::vector dense_slot_index_; + const std::vector sparse_slot_index_; + // used for svm data format + const std::vector sparse_slot_ids_; +}; + +inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) { + os << "data_desc:\n"; + os << "\tbatch_size -> " << data_desc.batch_size_ << "\n"; + os << "\tfile_type -> " << data_desc.file_type_ << "\n"; + os << "\tfile_format -> " << data_desc.file_format_ << "\n"; + os << "\tfile_names -> {"; + for (auto& file_name : data_desc.file_names_) { + os << file_name << ","; + } + os << "}\n"; + os << "\tdense_slot_index -> {"; + for (auto& slot : data_desc.dense_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_index_ -> {"; + for (auto& slot : data_desc.sparse_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_ids_ -> {"; + for (auto& slot : data_desc.sparse_slot_ids_) { + os << slot << ","; + } + os << "}\n"; + + return os; +} + void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, std::shared_ptr queue); // monitor all running thread, if they are all stopped, @@ -48,15 +102,15 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, size_t thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), 
file_list_(file_list) { + CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue, + int thread_num, const DataDesc& data_desc) + : data_desc_(data_desc) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger than 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = std::min(file_list_.size(), thread_num); + PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0, + "file list should not be empty"); + + thread_num_ = std::min<size_t>(data_desc_.file_names_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() {} + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader { for (auto& read_thread : read_threads_) { read_thread->join(); } - monitor_thread_->join(); + + if (monitor_thread_) { + monitor_thread_->join(); + } read_threads_.clear(); monitor_thread_.reset(nullptr); @@ -95,9 +152,9 @@ queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( - &ReadThread, file_groups_[thread_id], slots_, batch_size_, + &ReadThread, file_groups_[thread_id], data_desc_, static_cast<int>(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -108,8 +165,8 @@ private: void SplitFiles() { file_groups_.resize(thread_num_); - for (size_t i = 0; i < file_list_.size(); ++i) { - auto& file_name = file_list_[i]; + for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) { + auto& file_name = data_desc_.file_names_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); file_groups_[i % thread_num_].push_back(file_name); @@ -118,9 +175,7 @@ private: size_t thread_num_; - const int batch_size_; - const std::vector slots_; - const std::vector file_list_; + const DataDesc data_desc_; std::shared_ptr queue_; std::vector> read_threads_; std::unique_ptr monitor_thread_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 8dba9baebce0a82ee2a541fe6ae9f6bcef8e2835..9f3a254c84d4e04fbcd449644a7e138eff520fbc 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -36,6 +36,7 @@ using paddle::framework::LoD; using paddle::framework::DDim; using paddle::platform::CPUPlace; using paddle::framework::make_ddim; +using paddle::operators::reader::DataDesc; static void generatedata(const std::vector& data, const std::string& file_name) { @@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, false); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector<std::string> sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size,
thread_num, slots, file_list); + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, + sparse_slots); + + CTRReader reader(queue, thread_num, data_desc); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); reader.Start(); - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); } + +static void GenerateCsvData(const std::string& file_name, + const std::vector<std::string>& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector<paddle::framework::LoDTensor>& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data<int64_t>()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data<float>()[i * dense_dim[1] + j], + static_cast<float>(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data<int64_t>()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector<std::string> csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenerateCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector<std::string> file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector<paddle::framework::LoDTensor> out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 3f041ff7e4e32b407729a22aab25d3aab199fee0..5b53edff5d8ea79a03542231dbf34f5a6f254986 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -32,10 +32,8 @@ class LoDTensorBlockingQueue { friend class LoDTensorBlockingQueueHolder; private: - LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims, - bool speed_test_mode = false) - : queue_(capacity, speed_test_mode), dims_(dims) {} + explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false) + : queue_(capacity, speed_test_mode) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -65,17 +63,15 @@ private:
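+ // Note: the queue intentionally keeps no per-variable dims; shape + // information now comes from the reader op attributes (see the new + // use_data_config / infer_out attributes elsewhere in this patch).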
BlockingQueue> queue_; - std::vector dims_; }; class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims, - bool speed_test_mode = false) { + void InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); + queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index a0b70938d354cbb3bf10a9c8c589ba5153624f45..8fe638ac2fdc6e0baed7d6cd3c57b72f23164129 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase { "The ReadOp must take a reader as input."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "The ReadOp should be assigned with output."); - std::vector reader_dims = ctx->GetReaderDims("Reader"); - std::vector out_names = ctx->Outputs("Out"); - PADDLE_ENFORCE_EQ( - reader_dims.size(), out_names.size(), - "The reader's dim number doesn't match the output number."); - ctx->SetOutputsDim("Out", reader_dims); - if (!ctx->IsRuntime()) { + if (!ctx->IsRuntime() && ctx->Attrs().Get("infer_out")) { + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); auto in_lod_levels = in_desc->GetLoDLevels(); @@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + if (infer_out) { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } } } }; @@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + VLOG(3) << "read op in"; framework::ReaderHolder* reader = detail::Ref(scope.FindVar(Input("Reader")), "Cannot find reader variable %s", Input("Reader")) @@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { + VLOG(3) << "read empty data in"; if (Attr("throw_eof_exp")) { + VLOG(3) << "throw_eof_exp"; 
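+ // Once the underlying queue is closed and drained, ReadNext() hands back + // an empty vector; with throw_eof_exp set we raise EOF here, otherwise + // empty tensors are returned so callers can detect the end of one pass.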
PADDLE_THROW_EOF(); } else { ins.resize(out_arg_names.size()); @@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase { tensor.mutable_data(framework::make_ddim({0}), dev_place); } } + VLOG(3) << "read empty data out"; } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { @@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { " only when the data-balance is enabled in ParallelExecutor" " and it is set by ParallelExecutor instance, not users.") .SetDefault(true); + AddAttr("infer_out", "").SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index b82aab1214992be73d876a42424234e3cea46455..3921eedf94abbe68bed035940913f830a6c16e48 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() { "It means the reader will generate two data each time," "whose shapes are [2,3,4] and [5,6] respectively."); AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr( + "use_data_config", + "Use the config of all datas like shape_concat/ranks/lod_levels") + .SetDefault(true); Apply(); } @@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); - const auto shape_concat = ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); + bool use_data_config = ctx->Attrs().Get("use_data_config"); + if (use_data_config) { + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9349912e090f2ad3248923c87b50c8d72b0d84d1 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,113 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input should be NCHW."); + + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(), + ctx.device_context()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor<float>), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor<float>), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr<int>("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This operator shuffles the channels of input x. + It divides the input channels in each group into several subgroups, + and obtains a new order by selecting one element from every subgroup in turn. + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers.
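+ For example (an illustrative case): with 4 input channels and group = 2, + the channels are viewed as a 2 x 2 matrix [[0, 1], [2, 3]]; transposing + it gives [[0, 2], [1, 3]], so the output channel order is [0, 2, 1, 3].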
+ Please refer to the following paper for more information: + https://arxiv.org/pdf/1707.01083.pdf + )DOC"); + } +}; + +class ShuffleChannelGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input should be NCHW."); + + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, + ops::ShuffleChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker<true>); + +REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>, + ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>, + ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9506343b3d508459c6e10dc68eba13504b07338f --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/shuffle_channel_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ShuffleChannel(const int nthreads, const int feature_map_size, + T* output, const T* input, int group_row, + int group_column, int len) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t ii = index; ii < nthreads; ii += offset) { + const int n = index / group_row / group_column / len; + const int i = (index / group_column / len) % group_row; + const int j = index / len % group_column; + const int k = index - (n * feature_map_size + (i * group_column + j) * len); + T* p_o = output + n * feature_map_size + (j * group_row + i) * len; + p_o[k] = input[index]; + } +} +template +class ShuffleChannelOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + // count is the product of NCHW same as numel() + int count = num * group_column * group_row * sp_sz; + + int blocks = NumBlocks(output->numel()); + int threads = kNumCUDAThreads; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + ShuffleChannel< + T><<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); + } +}; + +template +class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + int blocks = NumBlocks(output_grad->numel()); + int threads = kNumCUDAThreads; + int count = num * group_column * group_row * sp_sz; + + ShuffleChannel< + T><<>>( + count, feature_map_size, input_grad_data, output_grad_data, group_row, + group_column, sp_sz); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.h 
b/paddle/fluid/operators/shuffle_channel_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6af1bc88598870ebccef81bd37f93f376940851 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ShuffleChannelOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = input_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = + output_data + n * feature_map_size + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +template +class ShuffleChannelGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = output_grad_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = input_grad_data + n * feature_map_size + + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad13e892efd51648b78704bec83bf2b4..031335009b692f9d1f73070c88e8e79d852cbe36 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ 
b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr<std::string>("calibration_data", "the calibration data for int8"); + AddAttr<std::string>( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr<framework::BlockDesc*>("sub_block", "the trt block"); + AddAttr<bool>("enable_int8", "whether to switch to int8 mode"); AddComment("TensorRT engine operator."); } }; @@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 88c4f508474e66953b79fb92ff1eb0b53a539f07..2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -17,8 +17,10 @@ #ifdef PADDLE_WITH_CUDA #include +#include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr<TRTInt8Calibrator> calibrator_; + bool enable_int8_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,26 +90,108 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + enable_int8_ = Attr<bool>("enable_int8"); + calibration_data_ = Attr<std::string>("calibration_data"); + engine_key_ = Attr<std::string>("engine_key"); auto params = Attr>("parameters"); for (const auto &param : params) { param_names_.insert(param); } + // calibration_mode being true means we need to + // generate the calibration table data.
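+ // In other words: enable_int8 == false -> a plain FP32 engine; + // enable_int8 with empty calibration_data -> a calibration pass that + // generates the table; enable_int8 with calibration_data present -> an + // int8 engine built from the stored table.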
+ calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); + + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr<framework::BlockDesc *>("sub_block"); + auto *program = block->Program(); + auto &current_scope = scope.NewScope(); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // This process will build a 32-bit trt engine, run it on the calibration + // set, and record a histogram for each + // tensor of the distribution of activation values. + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... "; + int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream(); + if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_); + std::unordered_map<std::string, size_t> calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope<framework::LoDTensor>(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, stream, + boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton<TRTCalibratorEngineManager>::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope<framework::LoDTensor>(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNativeImpl(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, nullptr, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, stream, + boost::get<platform::CUDAPlace>(dev_place).device, + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -126,6 +218,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } } + cudaStreamSynchronize(stream); PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine.
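+ // (The inputs were staged on `stream` above; the synchronize should + // ensure those copies are visible before the engine consumes them.)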
engine->Execute(runtime_batch); @@ -163,12 +256,13 @@ class TensorRTEngineOp : public framework::OperatorBase { output_index += 1; } - cudaStreamSynchronize(*engine->stream()); + cudaStreamSynchronize(stream); } void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 287b0edc96e5e312b0ff1725ee188ff319d44d23..5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", 
static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. engine_op->Run(scope, place); diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 5e16a209e712a143e1083e171f88002817aef838..a764d59410c90535dbda0b3f11e89ae9bf578c04 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - math::SetConstant()( - ctx.template device_context(), loss, static_cast(0)); - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size); - void* cudnn_workspace = temp_allocation->ptr(); - - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, - cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, - cu_ctcloss_desc, cudnn_workspace, workspace_size)); + + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( + handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + loss_data, cu_grad_desc, warpctc_grad_data, + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); } }; diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 9f504d14a8da116648483c0f64cb511b46e6a97e..2ce8f141d3c51661305f4952479cf2889fc4f396 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 @@ -30,6 +31,34 @@ namespace platform { mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +inline static int RoundToPowerOfTwo(int dim) { + if (dim > 512) { + return 1024; + } else if (dim > 256) { + return 512; + } else if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) 
\ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = 32) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d7822f1dc16cee2514a991b7341f5d1cfd..2493fb71c019f9923012afa4a46cb3e95479f860 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); + "Place %s is not supported, Please re-compile with WITH_GPU " + "option", + place); } return it->second.get().get(); } diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca89d91aadb2d3e9005e6dd06cef124428d7e250..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", namespace paddle { namespace platform { -int GetCUDADeviceCount() { +static int GetCUDADeviceCountImpl() { + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. 
No GPU detected."; + return 0; + } + } + int count; PADDLE_ENFORCE( cudaGetDeviceCount(&count), @@ -66,6 +79,11 @@ int GetCUDADeviceCount() { return count; } +int GetCUDADeviceCount() { + static auto dev_cnt = GetCUDADeviceCountImpl(); + return dev_cnt; +} + int GetCUDAComputeCapability(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); cudaDeviceProp device_prop; @@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); + "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " + "%p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index faac6a12c66378d090b642312df4538aeeb3d8cd..269280d604a13a62046fb7811d34b7c69b61b50f 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mem_fmt.ndims = axis.size(); for (unsigned int i = 0; i < nchw_tz.size(); ++i) { mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, - // regardless physical layout) + // regardless physical layout) } mem_fmt.data_type = mkldnn_f32; mem_fmt.format = mkldnn_blocked; @@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { for (int i = nchw_tz.size() - 1; i >= 0; --i) { mem_fmt.layout_desc.blocking.padding_dims[i] = nchw_tz[i]; // logical dimensions (nchw format, regardless physical - // layout) + // layout) mem_fmt.layout_desc.blocking.block_dims[i] = 1; mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dbc7843caa0c0a39a32cda6050fa99a3ab4c3e22..31c3bfa43ffec22059a602e9ff09a33188d72c91 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -15,18 +15,38 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module *m) { +void BindTracer(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CPUPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 26247026667158a2f43cdac21bf5600479455e16..e05667d2c7e9ce5c64cfacee4919cd36d7383c0c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { } void BindAnalysisConfig(py::module *m) { - py::class_(*m, "AnalysisConfig") - .def(py::init()) + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) .def(py::init()) .def(py::init()) .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & @@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, - py::arg("min_subgraph_size") = 3) + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c470483756659b55329c022e0c43002182db815b..97e5bbaaccaf7c702a324abd708a314c72ece004 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + 
self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) .def_property( @@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = @@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "_Scope", R"DOC( @@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. 
Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bb7258ee5913469d9f9a5f1bf5cf4bb4fa63938a..1135caf4f8c32901d93270d372fdaac702acf006 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -173,7 +173,6 @@ function cmake_gen() { -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} -DWITH_DSO=ON - -DWITH_DOC=${WITH_DOC:-OFF} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${distibuted_flag} @@ -208,7 +207,6 @@ EOF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ -DWITH_DSO=ON \ - -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${distibuted_flag} \ @@ -328,7 +326,8 @@ function run_brpc_test() { ======================================== EOF set +x - declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test") + declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \ + "rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test") all_tests=`ctest -N` for t in "${other_tests[@]}" @@ -527,31 +526,6 @@ function bind_test() { wait } - -function gen_docs() { - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - cat <= 0: + if ops_type[search_end_index] == "relu": + return Calibrator.u8_max + + input_name = input_index_name[search_end_index][0] + + for i in output_index_name.keys(): + if input_name in output_index_name[i]: + search_end_index = i + break + + if ops_type[ + search_end_index] not in Calibrator.const_sign_op_type and ops_type[ + search_end_index] != 'conv2d': + return Calibrator.s8_max + + if ops_type[search_end_index] != 'conv2d': + continue + + if program.current_block().ops[search_end_index].has_attr( + 'fuse_relu') and program.current_block().ops[ + search_end_index].attr('fuse_relu'): + return Calibrator.u8_max + else: + return Calibrator.s8_max + + return Calibrator.s8_max + + def __check_op_type_with_specified_var_as_input(self, + program, + var_name, + start_index=0): + ''' + Check whether all the type of ops that use the specified variable as the + input.If one of those op is not int8-enabled, return False. + ''' + op_type_list = [ + op.type for op in program.current_block().ops[start_index:] + if var_name in op.input_arg_names + ] + for i in op_type_list: + if not i in Calibrator.supported_int8_op_type: + return False + return True + + def __check_var_source_dt(self, var_name): + ''' + Check whether the specified variable is the output of int8 conv op or not. + If true, return the original op index. + If false, return -1 + ''' + return self._int8_output_var_op_index_dict[ + var_name] if var_name in self._int8_output_var_op_index_dict else -1 + + def __update_int8_output_var_op_index_dict(self, index, var_name=None): + ''' + Update the int8_output_variable/op_index dictionary + ''' + for k, v in self._int8_output_var_op_index_dict.items(): + if v >= index: + self._int8_output_var_op_index_dict[k] = v + 1 + if var_name: + self._int8_output_var_op_index_dict[var_name] = index + + def __update_program(self): + ''' + Update the program with the quantize/dequantize op insertion. 
+ ''' + quantize_index, dequantize_index = self.__get_quantize_dequantize_combination( + self._output_program) + inserted_op_length = 0 + calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max + insert_op_collection = sorted(quantize_index + dequantize_index) + + for index in insert_op_collection: + if index in quantize_index: + quantize_tmp = self._output_program.current_block().create_var( + name="quantize_{}_tmp".format(index), + dtype=core.VarDesc.VarType.UINT8) + original_out_name = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output_names[0] + original_out = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output(original_out_name)[0] + + op = self._output_program.current_block()._insert_op( + index=index + inserted_op_length, + type="quantize", + inputs={"Input": original_out}, + outputs={"Output": quantize_tmp}, ) + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr( + "Scale", self._var_max_range[original_out] / + calc_max_func(self._var_max_value_map[original_out])) + + if self.__get_max_range_by_var_name( + self._output_program, + original_out) == Calibrator.s8_max: + op._set_attr("is_negative_input", 1) + + self.__update_int8_output_var_op_index_dict( + index + inserted_op_length, "quantize_{}_tmp".format(index)) + + inserted_op_length += 1 + for op in self._output_program.current_block().ops[ + index + inserted_op_length:]: + for j in op.input_names: + if op.input(j) and op.input( + j + )[0] == original_out and op.type in Calibrator.supported_int8_op_type: + op.desc.set_input(j, + ["{}".format(quantize_tmp.name)]) + else: + start_index = index + inserted_op_length + dequantize_tmp_var = self._output_program.current_block( + ).create_var( + name="dequantize_{}_tmp".format(index + 1), + dtype="float32", ) + original_out_var = None + + for original_input in self._output_program.current_block().ops[ + start_index].input_arg_names: + index_res = self.__get_op_index_by_output_var( + self._output_program, original_input) + if index_res != -1: + original_out_var = original_input + break + + if original_out_var: + op = self._output_program.current_block()._insert_op( + index=start_index, + type="dequantize", + inputs={"Input": original_out_var}, + outputs={"Output": dequantize_tmp_var}) + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr("Scale", self._var_max_range[original_out_var] + / calc_max_func(self._var_max_value_map[ + original_out_var])) + + for op_index in range( + start_index + 1, + len(self._output_program.current_block().ops)): + if self._output_program.current_block( + ).ops[op_index].type == "conv2d" and self._output_program.current_block( + ).ops[op_index].attr("force_fp32_output"): + continue + else: + for j in self._output_program.current_block().ops[ + op_index].input_names: + if len(self._output_program.current_block().ops[ + op_index].input(j) + ) and self._output_program.current_block( + ).ops[op_index].input(j)[ + 0] == original_out_var: + self._output_program.current_block( + ).ops[op_index].desc.set_input( + j, + ["{}".format(dequantize_tmp_var.name)]) + + inserted_op_length += 1 + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + + def __update_output_program_attr(self): + for i in self._output_program.list_vars(): + if i.name in self._persistable_vars: + i.persistable = False + os.system("rm -rf {}/{}".format(self.pretrained_model, i.name)) + + for i in 
self._u8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.UINT8) + + for i in self._s8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.INT8) + + @property + def sampling_program(self): + return self._output_program + + @property + def sampling_vars(self): + return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name + + def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + def __generate_output_program(self): + for i in self.program.list_vars(): + if not i.persistable and i.name in self.sampling_vars: + i.persistable = True + self._persistable_vars.append(i.name) + + self._output_program = self.program.clone() + + def __save_scale(self): + ''' + Update the convolution scale information. + ''' + func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max + for i in self._conv_op_index[1:]: + weights_var_name = self.program.current_block().ops[i].input( + 'Filter')[0] + input_var_name = self.program.current_block().ops[i].input('Input')[ + 0] + output_var_name = self.program.current_block().ops[i].output( + 'Output')[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_weights", self._weights_scaling_factor[weights_var_name]) + + self._output_program.current_block().ops[i]._set_attr( + "Scale_in", self._var_max_range[input_var_name] / + func(self._var_max_value_map[input_var_name])) + self._output_program.current_block().ops[i]._set_attr( + "Scale_out", self._var_max_range[output_var_name] / + func(self._var_max_value_map[output_var_name])) + if self._output_program.current_block().ops[i].desc.input( + "ResidualData"): + residual_var_name = self._output_program.current_block().ops[ + i].desc.input("ResidualData")[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_in_eltwise", self._var_max_range[residual_var_name] / + func(self._var_max_value_map[residual_var_name])) + + def __sampling(self, sampling_data): + ''' + Sampling the variables data range. 
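+
+        For weight variables the scale is kept per output channel as
+        s8_max / max(abs(channel)), e.g. 127.0 / 0.5 = 254.0 for a channel
+        whose largest absolute weight is 0.5 (illustrative numbers). Conv
+        outputs use the u8 range (255) when the conv has fuse_relu set and
+        the s8 range (127) otherwise.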
+ ''' + for i in self.program.list_vars(): + if i.name not in self.sampling_vars: + continue + + if i.name in self._weights_var_name: + scaling_factor_per_channel = [] + data = sampling_data[i.name][0] + for j in range(data.shape[0]): + var_value = float(np.max(np.abs(data[j]))) + if not self._is_close(var_value, 0.0): + scaling_factor_per_channel.append(Calibrator.s8_max / + var_value) + else: + scaling_factor_per_channel.append(0.0) + self._weights_scaling_factor[ + i.name] = scaling_factor_per_channel + else: + if i.name in self._conv_output_var_name: + op_pos = self.__get_op_index_by_output_var(self.program, + i.name) + cur_op = self.program.current_block().ops[op_pos] + + if cur_op.has_attr('fuse_relu') and cur_op.attr( + 'fuse_relu'): + max_range = Calibrator.u8_max + self._u8_output_var.append(i.name) + else: + max_range = Calibrator.s8_max + self._s8_output_var.append(i.name) + else: + max_range = self.__get_max_range_by_var_name(self.program, + i.name) + max_value = [[np.abs(np_data)] + for np_data in sampling_data[i.name]] + + self._var_max_range[i.name] = max_range + self._var_max_value_map[i.name] = max_value + + def __check_force_fp32_attr_by_output_var(self, program, var_name): + for op in program.current_block().ops: + if op.type == "conv2d" and var_name in op.output_arg_names: + return op.attr("force_fp32_output") + return False + + def __get_op_index_by_output_var(self, program, var_name, start_index=0): + ''' + Check whether the specified input variable is the output of the + conv/pool2d op's output or not. + + Returns: + The index if the variable is the output of any conv/pool2d op's + output. + -1 when the variable is not the output of any conv/pool2d op's + output. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type: + return index + return -1 + + def __get_op_index_by_input_var(self, program, var_name, start_index=0): + ''' + Get the op index by specified input variable. + Returns: + The op index if the variable is the input of this op or -1 if the + variable is not the input of any op. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.input_arg_names: + return index + + return -1 + + def __get_quantize_dequantize_combination(self, program): + """ + Get the quantize/dequantize op index for further inserting. + Args: + The program desc. + Returns: + Two lists contains the quantize op and dequantize op index information. + """ + quantize_op_index = [] + dequantize_op_index = [] + minimal_conv_count = 2 # there must be two conv ops if not enable the first conv int8. 
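+        # Illustrative placement on a hypothetical graph: for
+        #   conv1 -> conv2 -> ... -> convN -> fc
+        # the first conv stays fp32, a quantize op is inserted before the
+        # second conv, and a dequantize op is inserted after the last
+        # int8-capable op.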
+ if len(self._conv_op_index) < minimal_conv_count: + return [], [] + + for index, value in enumerate(self._conv_op_index): + if index == 0: + quantize_op_index.append(self._conv_op_index[index + 1]) + elif index == len(self._conv_op_index) - 1: + output_var = program.current_block().ops[value].output( + "Output")[0] + if self.__check_op_type_with_specified_var_as_input( + program, output_var, index): + dequantize_op_index.append(self._conv_op_index[index] + 2) + else: + program.current_block().ops[value]._set_attr( + "force_fp32_output", True) + + elif self._conv_op_index[index] + 1 < self._conv_op_index[index + + 1]: + + program.current_block().ops[self._conv_op_index[ + index]]._set_attr("force_fp32_output", True) + + for op_index in range(self._conv_op_index[index + 1], + self._conv_op_index[index], -1): + op_type = program.current_block().ops[op_index].type + op_has_int8_input = False + input_var_name = None + input_length = len(program.current_block().ops[op_index] + .input_arg_names) + + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if self.__check_var_source_dt(var_name) != -1: + op_has_int8_input = True + input_var_name = var_name + break + + if op_has_int8_input: + if op_type == "conv2d": + if program.current_block().ops[op_index + + 1].type == "conv2d": + continue + elif program.current_block( + ).ops[op_index + + 1].type in Calibrator.non_conv_int8_op_type: + dequantize_op_index.append(op_index + 2) + break + else: + program.current_block().ops[op_index]._set_attr( + "force_fp32_output", True) + continue + elif not self.__check_force_fp32_attr_by_output_var( + program, input_var_name + ) and op_index not in dequantize_op_index: + share_input_flag = True + for input_attr_name in program.current_block().ops[ + op_index].input_names: + input_var_name = program.current_block().ops[ + op_index].input(input_attr_name)[0] + cousin_op_index = self.__get_op_index_by_input_var( + program, input_var_name) + if cousin_op_index != -1 and cousin_op_index in dequantize_op_index: + share_input_flag = False + break + if share_input_flag: + dequantize_op_index.append(op_index) + + elif input_length: + output_is_to_int8_op = False + share_input_flag = True + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if not self.__check_op_type_with_specified_var_as_input( + program, var_name): + share_input_flag = False + break + + for var_name in program.current_block().ops[ + op_index].output_arg_names: + if self.__get_op_index_by_output_var( + program, var_name, op_index) != -1: + output_is_to_int8_op = True + break + + if share_input_flag or output_is_to_int8_op: + quantize_op_index.append(op_index) + + return quantize_op_index, dequantize_op_index + + def __init_analysis(self): + ''' + Collect the variable names for sampling. + ''' + start_index = 1 #analysis the conv op detail from second conv op. 
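+        # The loop below records, for every conv op from the second one on,
+        # the Filter (weights), Input and Output variable names, any
+        # ResidualData input, and the Out of a directly following pool2d,
+        # so their ranges can be sampled later.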
+
+        for i in self._conv_op_index[start_index:]:
+            self._weights_var_name.append(self.program.current_block().ops[i]
+                                          .input('Filter')[0])
+            self._conv_input_var_name.append(self.program.current_block().ops[i]
+                                             .input('Input')[0])
+            self._conv_output_var_name.append(self.program.current_block().ops[
+                i].output('Output')[0])
+            self._int8_output_var_op_index_dict[self.program.current_block()
+                                                .ops[i].output('Output')[0]] = i
+            if self.program.current_block().ops[i].desc.input("ResidualData"):
+                self._residual_input_var_name.append(self.program.current_block(
+                ).ops[i].desc.input("ResidualData")[0])
+
+            if self.program.current_block().ops[i + 1].type == "pool2d":
+                self._pool2d_output_var_name.append(self.program.current_block(
+                ).ops[i + 1].output('Out')[0])
+
+    def __expand_quantized_bins(self, quantized_bins, reference_bins):
+        expanded_quantized_bins = [0] * len(reference_bins)
+        num_merged_bins = len(reference_bins) / len(quantized_bins)
+        j_start = 0
+        j_end = num_merged_bins
+        for idx in xrange(len(quantized_bins)):
+            zero_count = reference_bins[j_start:j_end].count(0)
+            num_merged_bins = j_end - j_start
+            if zero_count == num_merged_bins:
+                avg_bin_ele = 0
+            else:
+                avg_bin_ele = quantized_bins[idx] / (
+                    num_merged_bins - zero_count + 0.0)
+            for idx1 in xrange(j_start, j_end):
+                expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0
+                                                 else avg_bin_ele)
+            j_start += num_merged_bins
+            j_end += num_merged_bins
+            if (idx + 1) == len(quantized_bins) - 1:
+                j_end = len(reference_bins)
+        return expanded_quantized_bins
+
+    def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q,
+                       Q_sum):
+        '''
+        Calculate the entropy.
+        '''
+        assert len(reference_distr_P) == len(candidate_distr_Q)
+        tmp_sum1 = 0
+        tmp_sum2 = 0
+        for idx in range(len(reference_distr_P)):
+            p_idx = reference_distr_P[idx]
+            q_idx = candidate_distr_Q[idx]
+            if p_idx == 0:
+                tmp_sum1 += 0
+                tmp_sum2 += 0
+            else:
+                if q_idx == 0:
+                    print("Fatal error! idx = " + str(idx) +
+                          " q_idx = 0! p_idx = " + str(p_idx))
+                tmp_sum1 += p_idx * (math.log(Q_sum * p_idx))
+                tmp_sum2 += p_idx * (math.log(P_sum * q_idx))
+        return (tmp_sum1 - tmp_sum2) / P_sum
+
+    # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
+    def __get_optimal_scaling_factor(self,
+                                     activation_blob,
+                                     num_quantized_bins=255):
+        '''
+        Using the KL-divergence method to get a more precise scaling factor.
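+
+        Sketch of the search implemented below: the activation is histogrammed
+        into 2048 bins; for each candidate threshold i, the reference
+        distribution P is hist[0:i] with the clipped outliers folded into the
+        last bin, the candidate Q is P re-quantized into 255 bins and expanded
+        back, and the i that minimizes KL(P||Q) yields the scaling threshold
+        (min_kl_index + 0.5) * bin_width.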
+ ''' + max_val = np.max(activation_blob) + min_val = np.min(activation_blob) + if min_val >= 0: + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(min_val, max_val)) + ending_iter = 2047 + starting_iter = int(ending_iter * 0.7) + else: + th = max(abs(max_val), abs(min_val)) + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(-th, th)) + starting_iter = 0 + ending_iter = 2047 + if abs(max_val) > abs(min_val): + while starting_iter < ending_iter: + if hist[starting_iter] == 0: + starting_iter += 1 + continue + else: + break + starting_iter += int((ending_iter - starting_iter) * 0.6) + else: + while ending_iter > 0: + if hist[ending_iter] == 0: + ending_iter -= 1 + continue + else: + break + starting_iter = int(0.6 * ending_iter) + bin_width = hist_edeges[1] - hist_edeges[0] + P_sum = len(activation_blob) + min_kl_divergence = 0 + min_kl_index = 0 + kl_inited = False + for i in range(starting_iter, ending_iter + 1): + reference_distr_P = hist[0:i].tolist() + outliers_count = sum(hist[i:2048]) + if reference_distr_P[i - 1] == 0: + continue + reference_distr_P[i - 1] += outliers_count + reference_distr_bins = reference_distr_P[:] + candidate_distr_Q = hist[0:i].tolist() + num_merged_bins = i / num_quantized_bins + candidate_distr_Q_quantized = [0] * num_quantized_bins + j_start = 0 + j_end = num_merged_bins + for idx in xrange(num_quantized_bins): + candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[ + j_start:j_end]) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == num_quantized_bins - 1: + j_end = i + candidate_distr_Q = self.__expand_quantized_bins( + candidate_distr_Q_quantized, reference_distr_bins) + Q_sum = sum(candidate_distr_Q) + kl_divergence = self.__safe_entropy(reference_distr_P, P_sum, + candidate_distr_Q, Q_sum) + if not kl_inited: + min_kl_divergence = kl_divergence + min_kl_index = i + kl_inited = True + elif kl_divergence < min_kl_divergence: + min_kl_divergence = kl_divergence + min_kl_index = i + else: + pass + if min_kl_index == 0: + while starting_iter > 0: + if hist[starting_iter] == 0: + starting_iter -= 1 + continue + else: + break + min_kl_index = starting_iter + return (min_kl_index + 0.5) * bin_width + + @staticmethod + def __dot(program, output_name="model.dot"): + ''' + Generate the graphiz dot file for debugging. 
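+
+        In the generated graph, variables are drawn as yellow ovals;
+        int8-capable ops as greenyellow boxes (deeppink when a conv2d is
+        forced to fp32 output); quantize/dequantize ops as gold boxes; all
+        remaining ops as red boxes.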
+        '''
+        dot_graph = ""
+        dot_nodes = []
+        dot_edges = []
+        dot_graph += "digraph pm {\n"
+        for block in program.blocks:
+            ops = list(block.ops)
+            for index, op in enumerate(ops):
+                op_type = op.type
+                op_name = op_type + "_" + op.output_arg_names[0].replace(
+                    ".", "_") + "___" + str(index)
+                for name in op.input_arg_names:
+                    name = name.replace(".", "_")
+                    dot_edge = name + " -> " + op_name
+                    if dot_edge not in dot_edges:
+                        dot_edges.append(dot_edge)
+                    dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]"
+                    if dot_node not in dot_nodes:
+                        dot_nodes.append(dot_node)
+
+                for name in op.output_arg_names:
+                    name = name.replace(".", "_")
+                    dot_edge = op_name + " -> " + name
+                    if dot_edge not in dot_edges:
+                        dot_edges.append(dot_edge)
+                    if op_type in Calibrator.supported_int8_op_type:
+                        if op_type == "conv2d" and op.has_attr(
+                                'force_fp32_output') and op.attr(
+                                    "force_fp32_output"):
+                            dot_node = op_name + " [shape=box, style=filled, color=deeppink]"
+                        else:
+                            dot_node = op_name + " [shape=box, style=filled, color=greenyellow]"
+                    elif op_type in ["quantize", "dequantize"]:
+                        dot_node = op_name + " [shape=box, style=filled, color=gold]"
+                    else:
+                        dot_node = op_name + " [shape=box, style=filled, fillcolor=red]"
+
+                    if dot_node not in dot_nodes:
+                        dot_nodes.append(dot_node)
+
+        for dot_edge in dot_edges:
+            dot_graph += dot_edge + "\n"
+        for dot_node in dot_nodes:
+            dot_graph += dot_node + "\n"
+        dot_graph += "}"
+
+        with open(output_name, 'w') as f:
+            f.write(dot_graph)
diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e4b7d1ce3d9664495220d7ccfc6ef6eac0b81c2
--- /dev/null
+++ b/python/paddle/fluid/contrib/reader/README.md
@@ -0,0 +1,15 @@
+## CTR READER
+
+A multi-threaded cpp reader that has the same interface as py_reader. It
+uses cpp threads to read files and is much faster than the Python read
+thread in py_reader.
+
+Currently, it supports two types of file:
+  - gzip
+  - plain text file
+
+and two types of data format:
+  - the csv data format is:
+    * label dense_fea,dense_fea sparse_fea,sparse_fea
+  - the svm data format is:
+    * label slot1:fea_sign slot2:fea_sign slot1:fea_sign
diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cf85ffc166420f117db9576b4d687c96d429e3c
--- /dev/null
+++ b/python/paddle/fluid/contrib/reader/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import ctr_reader
+
+__all__ = ctr_reader.__all__
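For concreteness, here is a hypothetical two-line input file in each of the formats the README above describes (all labels, feature values, and slot ids are made up):

```
# csv: label dense_fea,dense_fea sparse_fea,sparse_fea
1 0.5,0.3 4001,4002
0 0.1,0.9 4003

# svm: label slot:fea_sign ...
1 101:8371 102:9032 101:4411
```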
diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py
index b8449e8d848670f8262aa01e5654e0e2fc621837..44e8647f8c3f52b0d3c52c7febfe2ef4ef878bd8 100644
--- a/python/paddle/fluid/contrib/reader/ctr_reader.py
+++ b/python/paddle/fluid/contrib/reader/ctr_reader.py
@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
     default_startup_program, Variable
 from paddle.fluid.unique_name import generate as unique_name
 
+__all__ = ['ctr_reader']
+
 
 def monkey_patch_reader_methods(reader):
     def __get_reader__():
@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
     def reset():
         return __get_reader__().reset()
 
+    def start():
+        return __get_reader__().start()
+
     reader.reset = reset
+    reader.start = start
     reader.stop_gradient = True
     reader.persistable = True
     return reader
@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
     return new_var
 
 
-def ctr_reader(feed_data,
-               capacity,
-               thread_num,
-               batch_size,
-               file_list,
-               slots,
-               name=None):
+def ctr_reader(
+        feed_dict,
+        file_type,  # gzip or plain
+        file_format,  # csv or svm
+        dense_slot_index,
+        sparse_slot_index,
+        capacity,
+        thread_num,
+        batch_size,
+        file_list,
+        slots,
+        name=None):
     """
     Create a CTR reader for data feeding in Python
 
@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
     Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
 
     Args:
+        feed_dict(list(variable)): a list of data variables.
+        file_type('gzip'|'plain'): the type of the data file.
+        file_format('csv'|'svm'): csv or svm data format.
+            the csv data format is:
+                label dense_fea,dense_fea sparse_fea,sparse_fea
+            the svm data format is:
+                label slot1:fea_sign slot2:fea_sign slot1:fea_sign
+        dense_slot_index(list(int)): the index of dense slots.
+        sparse_slot_index(list(int)): the index of sparse slots.
         capacity(int): The buffer capacity maintained by :code:`py_reader`.
-        thread_num(list|tuple): List of tuples which declaring data shapes.
-        batch_size(list|tuple): List of strs which declaring data type.
-        file_list(list|tuple): List of ints which declaring data lod_level.
-        slots(bool): Whether use double buffer or not.
-        name(basestring): The prefix Python queue name and Reader name. None will
+        thread_num(int): the number of threads used by the cpp reader to read files.
+        batch_size(int): the batch size of the data.
+        file_list(list(str)): the list of file names to read.
+        slots(list(int64)): the list of slot ids.
+        name(string): The prefix Python queue name and Reader name. None will
             be generated automatically.
 
     Returns:
@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
 
     Examples:
 
-        1. The basic usage of :code:`py_reader` is as follows:
+        1. The basic usage of :code:`ctr_reader` is as follows:
+
+        .. code-block:: python
+
+            py_reader = fluid.contrib.ctr_reader.ctr_reader(
+                feed_dict=datas, file_type='plain', file_format='csv',
+                file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
+                capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
+
     """
     if name is None:
         queue_name = unique_name('lod_tensor_blocking_queue')
@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
     reader_name = "_".join([name, "reader"])
     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
 
     startup_blk = default_startup_program().current_block()
     reader_var = startup_blk.create_var(name=reader_name)
@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
         inputs={'blocking_queue': [queue_name]},
         outputs={'Out': [reader_var]},
         attrs={
+            'use_data_config': False,
             'thread_num': thread_num,
             'batch_size': batch_size,
             'file_list': file_list,
-            'slots': slots,
+            'file_type': file_type,
+            'file_format': file_format,
+            'dense_slot_index': dense_slot_index,
+            'sparse_slot_index': sparse_slot_index,
+            'sparse_slots': slots,
+            'ranks': [],
+            'lod_levels': [],
+            'shape_concat': []
         })
+
+    dtypes = [data.dtype for data in feed_dict]
+    reader_var.desc.set_dtypes(dtypes)
     reader_var.persistable = True
 
     main_prog_reader_var = _copy_reader_var_(
@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
     main_blk = default_main_program().current_block()
     main_blk.append_op(
-        type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data})
+        type='read',
+        inputs={'Reader': [reader]},
+        attrs={'infer_out': False},
+        outputs={'Out': feed_dict})
 
     return reader
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index 79bec8c4ad34d682895250bc29b1fddb3a569bd4..81aee1233d1db756686d1a934b94672dc5c770fe 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -1,6 +1,10 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+if(APPLE OR WIN32 OR NOT WITH_MKL)
+    list(REMOVE_ITEM TEST_OPS test_calibration)
+endif()
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
new file mode 100644
index 0000000000000000000000000000000000000000..f07fefe7e097377a845193bb37b6e9aa42708948
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
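Before the test body, a condensed sketch of the calibration flow it exercises; the model path and the sampling batches are placeholders, and it assumes the int8_inference utility module added by this patch is importable:

```python
import paddle.fluid as fluid
import int8_inference.utility as int8_utility

exe = fluid.Executor(fluid.CPUPlace())
# Load a pretrained fp32 inference model (placeholder path).
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "resnet50_fp32/model", exe)

calibrator = int8_utility.Calibrator(
    program=program, pretrained_model="resnet50_fp32/model", algo='KL',
    exe=exe, output="calibration_out", feed_var_names=feed_names,
    fetch_list=fetch_targets)

for image, label in sampling_batches:  # placeholder iterable of numpy arrays
    exe.run(calibrator.sampling_program,
            feed={feed_names[0]: image, feed_names[1]: label},
            fetch_list=fetch_targets)
    calibrator.sample_data()  # record variable ranges for this batch

calibrator.save_int8_model()  # write the quantized model to `output`
```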
+import unittest +import os +import numpy as np +import time +import sys +import random +import paddle +import paddle.fluid as fluid +import argparse +import functools +import contextlib +import paddle.fluid.profiler as profiler +from paddle.dataset.common import download +from PIL import Image, ImageEnhance +import math +sys.path.append('..') +import int8_inference.utility as int8_utility + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 + +THREAD = 1 +BUF_SIZE = 102400 + +DATA_DIR = 'data/ILSVRC2012' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + return img, sample[1] + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(full_lines) + + lines = full_lines + + for line in lines: + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + yield img_path, int(label) + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def val(data_dir=DATA_DIR): + file_list = os.path.join(data_dir, 'val_list.txt') + return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) + + +class TestCalibrationForResnet50(unittest.TestCase): + def setUp(self): + self.int8_download = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.int8_download) + + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' + data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d' + self.data_cache_folder = self.download_data(data_url, data_md5, "data") + + # reader/decorator.py requires the relative path to the data folder + cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", + self.data_cache_folder) + os.system(cmd) + + self.iterations = 50 + + def cache_unzipping(self, target_folder, zip_path): + if not os.path.exists(target_folder): + cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, + zip_path) + os.system(cmd) + + def download_data(self, data_url, data_md5, folder_name): + download(data_url, self.int8_download, data_md5) + data_cache_folder = 
os.path.join(self.cache_folder, folder_name) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + self.cache_unzipping(data_cache_folder, zip_path) + return data_cache_folder + + def download_resnet50_model(self): + # resnet50 fp32 data + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' + data_md5 = '4a5194524823d9b76da6e738e1367881' + self.model_cache_folder = self.download_data(data_url, data_md5, + "resnet50_fp32") + + def run_program(self, model_path, generate_int8=False, algo='direct'): + image_shape = [3, 224, 224] + os.environ['FLAGS_use_mkldnn'] = 'True' + + fluid.memory_optimize(fluid.default_main_program()) + + exe = fluid.Executor(fluid.CPUPlace()) + + [infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + t = fluid.transpiler.InferenceTranspiler() + t.transpile(infer_program, fluid.CPUPlace()) + + val_reader = paddle.batch(val(), batch_size=1) + + if generate_int8: + int8_model = os.path.join(os.getcwd(), "calibration_out") + + if os.path.exists(int8_model): + os.system("rm -rf " + int8_model) + os.system("mkdir " + int8_model) + + print("Start calibration ...") + + calibrator = int8_utility.Calibrator( + program=infer_program, + pretrained_model=model_path, + algo=algo, + exe=exe, + output=int8_model, + feed_var_names=feed_dict, + fetch_list=fetch_targets) + + test_info = [] + cnt = 0 + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(image_shape) for x in data]).astype("float32") + label = np.array([x[1] for x in data]).astype("int64") + label = label.reshape([-1, 1]) + running_program = calibrator.sampling_program.clone( + ) if generate_int8 else infer_program.clone() + for op in running_program.current_block().ops: + if op.has_attr("use_mkldnn"): + op._set_attr("use_mkldnn", True) + + _, acc1, _ = exe.run( + running_program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + if generate_int8: + calibrator.sample_data() + + test_info.append(np.mean(acc1) * len(data)) + cnt += len(data) + + if batch_id != self.iterations - 1: + continue + + break + + if generate_int8: + calibrator.save_int8_model() + + print( + "Calibration is done and the corresponding files are generated at {}". 
+ format(os.path.abspath("calibration_out"))) + else: + return np.sum(test_info) / cnt + + def test_calibration(self): + self.download_resnet50_model() + fp32_acc1 = self.run_program(self.model_cache_folder + "/model") + self.run_program(self.model_cache_folder + "/model", True) + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): + def download_mobilenetv1_model(self): + # mobilenetv1 fp32 data + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + data_md5 = '13892b0716d26443a8cdea15b3c6438b' + self.model_cache_folder = self.download_data(data_url, data_md5, + "mobilenetv1_fp32") + + def test_calibration(self): + self.download_mobilenetv1_model() + fp32_acc1 = self.run_program(self.model_cache_folder + "/model") + self.run_program(self.model_cache_folder + "/model", True, algo='KL') + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fc5e471ae3041b0245c873d85efa8b49f3a43678..2bdae60db347b3d42fded138a20a505486e48dbc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _imperative_tracer_ = None +_imperative_current_expected_place_ = None def _in_imperative_mode(): @@ -80,6 +81,10 @@ def _imperative_tracer(): return _imperative_tracer_ +def _current_expected_place(): + return _imperative_current_expected_place_ + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -383,8 +388,8 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - tensor = self._ivar.value().get_tensor() - return np.array(tensor) + new_ivar = self._ivar._copy_to(core.CPUPlace(), True) + return np.array(new_ivar.value().get_tensor()) def _backward(self): self._ivar._run_backward() @@ -1311,6 +1316,7 @@ class Block(object): def _trace_op(self, op, stop_gradient=False): if _in_imperative_mode(): _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + _imperative_current_expected_place_, stop_gradient) def _insert_op(self, index, *args, **kwargs): @@ -1696,12 +1702,20 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] - # for distribute + # for distribute training + # _is_distributed = True if under distributed training self._is_distributed = False + # _is_chief = True if the trainer is the first one, usually No.0 self._is_chief = False - self._slice_vars_and_attrs = [] + # _parameters_on_pservers records all the parameters distributed on parameter servers. + self._parameters_on_pservers = None + # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"] self._endpoints = [] + # if current role is parameter server, the _ps_endpoint is its "ip:port" + self._ps_endpoint = None + # trainers_endpoints, it is used for distribution. 
self._trainers_endpoints = [] + # the distributed lookup table names self._distributed_lookup_table = None @property @@ -2232,8 +2246,9 @@ class Program(object): "Program") self._is_distributed = other._is_distributed self._is_chief = other._is_chief - self._slice_vars_and_attrs = other._slice_vars_and_attrs + self._parameters_on_pservers = other._parameters_on_pservers self._endpoints = other._endpoints + self._ps_endpoint = other._ps_endpoint self._distributed_lookup_table = other._distributed_lookup_table def _copy_data_info_from(self, other): @@ -2493,5 +2508,18 @@ def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace + + +@contextlib.contextmanager +def _imperative_place_guard(place): + global _imperative_current_expected_place_ + tmp_place = _imperative_current_expected_place_ + _imperative_current_expected_place_ = place + + yield + + _imperative_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 5d3ebb25a935cea6ec376e6bc044281dcba37337..ff3984b11f42cf9e6ff49c8654c600c065effe1d 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -25,18 +25,28 @@ def enabled(): @contextlib.contextmanager -def guard(): +def guard(place=None): train = framework.Program() startup = framework.Program() tracer = core.Tracer(train.current_block().desc) + + if place is None: + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): - yield + with framework._imperative_place_guard(place): + yield def to_variable(value, block=None): if isinstance(value, np.ndarray): + assert enabled(), "to_variable could only be called in imperative mode" + if not block: block = framework.default_main_program().current_block() py_var = framework.Variable( @@ -47,9 +57,7 @@ def to_variable(value, block=None): dtype=value.dtype) var = py_var._ivar.value() tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) + tensor.set(value, framework._current_expected_place()) return py_var elif isinstance(value, framework.Variable): return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 03fbfe76d120e30edbe7f88ca716141d150d3a9c..140c0ff037d453641cc119301269121025e17cbd 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -27,6 +27,7 @@ __all__ = [ 'Conv2D', 'Pool2D', 'FC', + 'BatchNorm', ] @@ -55,7 +56,8 @@ class Conv2D(layers.Layer): param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name) + name=name, + act=act) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -141,6 +143,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act) @@ -216,6 +219,7 @@ class FC(layers.Layer): act=None, name=None): super(FC, self).__init__() + self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype @@ -241,6 +245,16 @@ class FC(layers.Layer): dtype=self._dtype, is_bias=False) + if self._helper.bias_attr: + size = list([self._size]) + self._b = self._helper.create_parameter( + 
attr=self._helper.bias_attr, + shape=size, + dtype=self._dtype, + is_bias=True) + else: + self._b = None + def forward(self, input): tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( @@ -253,28 +267,155 @@ class FC(layers.Layer): "y_num_col_dims": 1 }) - out = self._helper.create_variable_for_type_inference(self._dtype) + pre_bias = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="sum", inputs={"X": [tmp]}, - outputs={"Out": out}, + outputs={"Out": pre_bias}, attrs={"use_mkldnn": False}) - bias_attr = self._helper.bias_attr - if bias_attr: - # add bias - size = list(out.shape[1:]) - if not self._built: - self._b = self._helper.create_parameter( - attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True) - bias_out = self._helper.create_variable_for_type_inference( - dtype=out.dtype) + if self._b: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) self._helper.append_op( type='elementwise_add', - inputs={'X': [out], + inputs={'X': [pre_bias], 'Y': [self._b]}, - outputs={'Out': [bias_out]}, - attrs={'axis': 1}) - out = bias_out - # add activation - return self._helper.append_activation(out) + outputs={'Out': [pre_activation]}, + attrs={'axis': self._num_flatten_dims}) + else: + pre_activation = pre_bias + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_activation) + + +class BatchNorm(layers.Layer): + def __init__(self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype=core.VarDesc.VarType.FP32, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False, + fuse_with_relu=False, + use_global_stats=False): + super(BatchNorm, self).__init__() + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
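+        # Illustrative usage (names assumed), inside fluid.imperative.guard():
+        #     bn = BatchNorm(num_channels=64, act='relu')
+        #     out = bn(to_variable(x))  # x: numpy array of shape [N, 64, H, W]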
+ + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + 'batch_norm', + param_attr=param_attr, + bias_attr=bias_attr, + name=name, + act=act) + + if dtype == core.VarDesc.VarType.FP16: + self._dtype = core.VarDesc.VarType.FP32 + else: + self._dtype = dtype + + param_shape = [num_channels] + + # create parameter + self._scale = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.param_attr.learning_rate == 0.: + # self._scale.stop_gradient = True + + self._bias = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + # self._bias.stop_gradient = True + + self._mean = self._helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self._helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._in_place = in_place + self._momentum = momentum + self._epsilon = epsilon + self._is_test = is_test + self._fuse_with_relu = fuse_with_relu + self._use_global_stats = use_global_stats + + def _build_once(self, input): + pass + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + self._helper.append_op( + type="batch_norm", + inputs={ + "X": input, + "Scale": self._scale, + "Bias": self._bias, + "Mean": self._mean, + "Variance": self._variance + }, + outputs={ + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={ + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": self._is_test, + "use_mkldnn": False, + "fuse_with_relu": self._fuse_with_relu, + "use_global_stats": self._use_global_stats + }) + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(batch_norm_out) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8a2cd4a9290ab1d2f3e2af2d682994c999d7b931..4f434328e47df4363b304ff55f587018d3157c5e 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -24,7 +24,8 @@ __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', 
'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer',
-    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer'
+    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer',
+    'NumpyArrayInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -683,6 +684,64 @@ class BilinearInitializer(Initializer):
         return op
 
 
+class NumpyArrayInitializer(Initializer):
+    """Initialize a parameter with a numpy array
+
+    Args:
+        value (numpy.ndarray): the numpy array used to initialize the variable
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2])))
+    """
+
+    def __init__(self, value):
+        import numpy
+        assert isinstance(value, numpy.ndarray)
+        super(NumpyArrayInitializer, self).__init__()
+        self._value = value
+
+    def __call__(self, var, block):
+        """Add an assign_value initialization op for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype)
+        if dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in self._value.flat]
+        elif dtype == VarDesc.VarType.INT32:
+            value_name = "int32_values"
+            values = [int(v) for v in self._value.flat]
+        else:
+            raise ValueError("Unsupported dtype %s", self._value.dtype)
+        if self._value.size > 1024 * 1024 * 5:
+            raise ValueError("The size of input is too big. Please consider "
+                             "saving it to a file and using load_op to load it")
+        op = block._prepend_op(
+            type='assign_value',
+            outputs={'Out': var},
+            attrs={
+                'dtype': dtype,
+                'shape': list(self._value.shape),
+                value_name: values
+            },
+            stop_gradient=True)
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index e74a87fc68db0e126098f7188db4a712dff2612d..6b1d4cc34f3cd40c878740f28618f26d5e89a6bd 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -19,6 +19,7 @@ import errno
 import time
 import shutil
 import six
+from functools import reduce
 
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
@@ -183,8 +184,6 @@ def save_vars(executor,
             # NOTE: don't save the variable which type is RAW
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name == main_program._distributed_lookup_table:
-                continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
                 save_block.append_op(
@@ -206,16 +205,6 @@ def save_vars(executor,
                 outputs={},
                 attrs={'file_path': os.path.join(dirname, filename)})
 
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-        if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-            lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-            attrs = {}
-            attrs['epmap'] = main_program._endpoints
-            attrs['dir'] = lookup_table_filename
-            attrs['lookup_table'] = main_program._distributed_lookup_table
-            save_block.append_op(
-                type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-
         executor.run(save_program)
 
@@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None):
         filename=filename)
 
 
+def _save_distributed_persistables(executor, dirname, main_program):
+    """
+    save_persistables for distributed training.
+    The method does the things listed below:
+    1. save the persistable variables that live on the trainer.
+    2. receive "remote prefetch variables" from the parameter servers and merge them.
+    3. save the "distributed lookup table" on the parameter servers.
+    4. receive "optimizer variables" from the parameter servers and merge them.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program): The program whose parameters will be
+                            saved. It must be the trainer_program
+                            obtained from the transpiler.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            train_program = t.get_trainer_program()
+            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
+    """
+
+    def __save_remote_params(executor, dirname, remote_params_map):
+        """
+        Receive params from the pservers through RPC.
+        If a param has been sliced, concat the slices into one variable and save it.
+        """
+        if not remote_params_map:
+            return
+
+        prog = Program()
+        block = prog.global_block()
+
+        # recv optimizer vars from the pservers
+        for name, remote_params in remote_params_map.items():
+            origin_var = None
+            is_slice = False
+            slice_vars = [0] * len(remote_params)
+            slice_var_names = [""] * len(remote_params)
+            endpoints = [""] * len(remote_params)
+
+            for idx, optimizer in enumerate(remote_params):
+                origin = optimizer.origin
+                slice = optimizer.slice
+                is_slice = optimizer.is_slice
+                block_id = optimizer.block_id
+                endpoint = optimizer.endpoint
+
+                if idx == 0:
+                    origin_var = block.create_var(
+                        name=origin.name,
+                        type=origin.type,
+                        shape=origin.shape,
+                        dtype=origin.dtype,
+                        persistable=True)
+
+                slice_var = block.create_var(
+                    name="{}.slice.{}".format(slice.name, idx),
+                    type=slice.type,
+                    shape=slice.shape,
+                    dtype=slice.dtype,
+                    persistable=True)
+
+                index = block_id if is_slice else idx
+                slice_vars[index] = slice_var
+                slice_var_names[index] = slice.name
+                endpoints[index] = endpoint
+
+            if is_slice:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": slice_vars},
+                    attrs={
+                        "epmap": endpoints,
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+                block.append_op(
+                    type='concat',
+                    inputs={'X': slice_vars},
+                    outputs={'Out': origin_var},
+                    attrs={})
+            else:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": [origin_var]},
+                    attrs={
+                        "epmap": endpoints[:1],
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+            block.append_op(
+                type='save',
+                inputs={'X': [origin_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(dirname, origin_var.name)})
+            block.append_op(type='delete_var', inputs={'X': slice_vars})
+        executor.run(prog)
+
+    def __save_distributed_lookup_tables(executor, dirname,
+                                         distributed_lookup_table, endpoints):
+        """
+        Because the distributed lookup table may be too huge to merge and save in one place,
+        each parameter server saves its own shard independently.
+
+        The save directory is dirname/"__lookup_table__".
+
+        """
+        prog = Program()
+        block = prog.global_block()
+
+        # if there is a lookup table, trainer 0 notifies all pservers to save it.
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        attrs = {}
+        attrs['epmap'] = endpoints
+        attrs['dir'] = lookup_table_filename
+        attrs['lookup_table'] = distributed_lookup_table
+        block.append_op(
+            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+        executor.run(prog)
+
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_save_distributed_persistables' is only designed for distributed training."
+        )
+
+    remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes(
+        ["Optimizer", "RemotePrefetch"], groupby=True)
+
+    exclude_var_names = []
+    if remote_params_map:
+        exclude_var_names.extend(remote_params_map.keys())
+
+    if main_program._distributed_lookup_table:
+        if isinstance(main_program._distributed_lookup_table, list):
+            exclude_var_names.extend(main_program._distributed_lookup_table)
+        else:
+            exclude_var_names.append(main_program._distributed_lookup_table)
+
+    local_vars = list(
+        filter(__exclude_vars(exclude_var_names), main_program.list_vars()))
+    save_vars(
+        executor, main_program=main_program, dirname=dirname, vars=local_vars)
+
+    if main_program._is_chief:
+        if remote_params_map:
+            __save_remote_params(executor, dirname, remote_params_map)
+        if main_program._distributed_lookup_table:
+            __save_distributed_lookup_tables(
+                executor, dirname, main_program._distributed_lookup_table,
+                main_program._endpoints)
+
+
 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
     This function filters out all variables with `persistable==True` from the
@@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
-    save_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=is_persistable,
-        filename=filename)
+
+    if main_program and main_program._is_distributed:
+        _save_distributed_persistables(
+            executor, dirname=dirname, main_program=main_program)
+
+    else:
+        save_vars(
+            executor,
+            dirname=dirname,
+            main_program=main_program,
+            vars=None,
+            predicate=is_persistable,
+            filename=filename)
 
 
 def load_vars(executor,
@@ -402,17 +577,11 @@ def load_vars(executor,
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
-        load_slice_vars = []
-        for each_var in main_program._slice_vars_and_attrs:
-            load_slice_vars.append(each_var[2].name)
-
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name in load_slice_vars:
-                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -435,10 +604,6 @@ def load_vars(executor,
                 attrs={'file_path': os.path.join(dirname, filename)})
 
         executor.run(load_prog)
 
-        # load slice vars on pserver, if have it.
-        _load_slice_up_vars(executor, dirname,
-                            main_program._slice_vars_and_attrs)
-
 
 def load_params(executor, dirname, main_program=None, filename=None):
     """
@@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.load_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
-    load_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        predicate=is_persistable,
-        filename=filename)
+
+    if main_program and main_program._is_distributed:
+        _load_distributed_persistables(
+            executor, dirname=dirname, main_program=main_program)
+    else:
+        load_vars(
+            executor,
+            dirname=dirname,
+            main_program=main_program,
+            predicate=is_persistable,
+            filename=filename)
+
+
+def _load_distributed_persistables(executor, dirname, main_program=None):
+    """
+    Customized load_persistables for distributed training;
+    it should be used on the parameter server side.
+
+    Args:
+        executor(Executor): The executor to run for loading parameters.
+        dirname(str): The load directory path.
+        main_program(Program): The program whose parameters will be
+                            loaded. It must be the pserver_program
+                            obtained from the transpiler.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            pserver_prog = t.get_pserver_program(...)
+            _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
+    """
+
+    def __is_distributed_part_var(varname):
+        # str.find returns -1 (which is truthy) on a miss, so compare explicitly
+        trainer_idx = varname.find(".trainer_")
+        block_idx = varname.find(".block")
+        return trainer_idx != -1 or block_idx != -1
+
+    def __load_persistable_vars(executor, dirname, need_load_vars):
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        need_delete_vars = []
+
+        for param in need_load_vars:
+            origin_var = param.origin
+            slice_var = param.slice
+            is_slice = param.is_slice
+            offset = param.offset
+
+            if is_slice:
+                origin = load_block.create_var(
+                    name="{}.load".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+                slice = load_block.create_var(
+                    name=slice_var.name,
+                    type=slice_var.type,
+                    shape=slice_var.shape,
+                    dtype=slice_var.dtype,
+                    persistable=True)
+
+                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:], 1)  # 1 handles 1-D vars
+                start = int(offset / dim1_flatten)
+                end = int(offset / dim1_flatten + slice.shape[0])
+
+                load_block.append_op(
+                    type="slice",
+                    inputs={'Input': origin},
+                    outputs={'Out': slice},
+                    attrs={'axes': [0],
+                           'starts': [start],
+                           'ends': [end]})
+
+                need_delete_vars.append(origin)
+            else:
+                origin = load_block.create_var(
+                    name="{}".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+        load_block.append_op(
+            type='delete_var',
+            inputs={'X': need_delete_vars}, )
+
+        executor.run(load_prog)
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_load_distributed_persistables' is only designed for distributed training."
+        )
+
+    if not main_program._ps_endpoint:
+        raise ValueError(
+            "'_load_distributed_persistables' needs current_endpoint to be set in DistributeTranspiler.transpile."
+        )
+
+    need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep(
+        main_program._ps_endpoint)
+    __load_persistable_vars(executor, dirname, need_load_vars)
 
 
 def prepend_feed_ops(inference_program,
@@ -795,52 +1082,6 @@ def load_inference_model(dirname,
     return [program, feed_target_names, fetch_targets]
 
 
-def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
-                                  pserver_endpoints):
-    """
-    This function will send checkpoint notify message from Trainer 0
-    to all the pservers.
-    The checkpoint notify message contains lookup table name,
-    the absolute path on pserver to save lookup_table.
-
-    Args:
-        executor(Executor): The executor to run for send checkpoint notify.
-        dirname(str): The folder where to save.
-        lookup_table(string): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name
-        ps_endpoint_list(list): the parameter server ip:port list.
-            when use distribute lookup table, we can get ps_endpoint_list by
-            distribute arguments.
-    Return:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            _save_pserver_vars_by_notify(executor=exe,
-                    dirname=param_path, lookup_table=table_name,
-                    pserver_endpoints=ps_endpoints)
-    """
-
-    pserver_notify_program = Program()
-    pserver_notify_block = pserver_notify_program.global_block()
-
-    attrs = {}
-    attrs['epmap'] = pserver_endpoints
-    attrs['dir'] = dirname
-    attrs['lookup_table'] = lookup_table
-
-    pserver_notify_block.append_op(
-        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-    executor.run(pserver_notify_program)
-
-
 def _endpoints_replacement(program, endpoints):
     ENDPOINT_MAP = "epmap"
     for op in program.global_block().ops:
@@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
-
-
-def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
-    if not slice_vars_and_attrs:
-        return
-
-    load_prog = Program()
-    load_block = load_prog.global_block()
-    need_delete_vars = []
-
-    for var_tuple in slice_vars_and_attrs:
-        orig_var = var_tuple[0]
-        start = var_tuple[1]
-        slice_var = var_tuple[2]
-        end = start + slice_var.shape[0]
-
-        orig_var_name = orig_var.name
-        orig_var.name = "{}.origin".format(orig_var_name)
-
-        clone_orig_var = load_block.create_var(
-            name=orig_var.name,
-            type=orig_var.type,
-            shape=orig_var.shape,
-            dtype=orig_var.dtype,
-            persistable=True)
-
-        clone_slice_var = load_block.create_var(
-            name=slice_var.name,
-            type=slice_var.type,
-            shape=slice_var.shape,
-            dtype=slice_var.dtype,
-            persistable=True)
-
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [clone_orig_var]},
-            attrs={'file_path': os.path.join(dirname, orig_var_name)})
-        load_block.append_op(
-            type="slice",
-            inputs={'Input': clone_orig_var},
-            outputs={'Out': clone_slice_var},
-            attrs={'axes': [0],
-                   'starts': [start],
-                   'ends': [end]})
- need_delete_vars.append(clone_orig_var) - - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, ) - executor.run(load_prog) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index ea9953f5814207c569e799c2ffa11758e31699aa..972c51938f2b2282f8de4b090f9af3bc66f89155 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -435,7 +435,10 @@ class LayerHelper(object): act_type = act.pop('type') tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. - if not core.IsInplace(act_type): + # NOTE(minqiyang): currently, we don't support inplace in imperative mode + if not imperative_base.enabled() and core.IsInplace(act_type): + tmp = input_var + else: tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type=act_type, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 9a29b2509357c93a684d736cf0d2523970fb5ff1..1762bd3e343e8af6768dd23f8fbc58cd0182d3c9 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -523,7 +523,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index dde05189722fef77e03a1c2d8f3cbae44a3e8245..617704a53138bd081a2ebe318de0c89e8db4aa96 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay): The decayed learning rate Examples: .. 
code-block:: python
-
+
             learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                         / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
     """
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6765a89a1b0d24b47953f46bcd62136efef3f69b..beb5e31211c5f9aa6bddfcb1da7e63d6480e99e1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -22,7 +22,7 @@ import six
 import os
 import inspect
 from ..layer_helper import LayerHelper
-from ..initializer import Normal, Constant
+from ..initializer import Normal, Constant, NumpyArrayInitializer
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -179,6 +179,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'shuffle_channel',
     'py_func',
     'psroi_pool',
     'teacher_student_sigmoid_loss',
@@ -2874,7 +2875,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
     # setting stop_gradient=True to reduce computation
     if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        scale.stop_gradient = True
+        bias.stop_gradient = True
 
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -3875,6 +3876,7 @@ def beam_search(pre_ids,
                 beam_size,
                 end_id,
                 level=0,
+                is_accumulated=True,
                 name=None):
     """
     Beam search is a classical algorithm for selecting candidate words in a
@@ -3887,14 +3889,17 @@ def beam_search(pre_ids,
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
-    the output of beam_search at previous step, they are needed for special use
-    to handle ended candidate translations.
-
-    Note that the :attr:`scores` passed in should be accumulated scores, and
-    length penalty should be done with extra operators before calculating the
-    accumulated scores if needed, also suggest finding top-K before it and
-    using the top-K candidates following.
+    computation cell. If :attr:`ids` is not set, it will be inferred from
+    :attr:`scores`. Additionally, :attr:`pre_ids` and
+    :attr:`pre_scores` are the output of beam_search at the previous step; they
+    are needed for the special handling of ended candidate translations.
+
+    Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores`
+    passed in should be accumulated scores. Otherwise, :attr:`scores` are
+    treated as the straightforward scores of the current step; they will be
+    log-transformed and accumulated with :attr:`pre_scores` inside this operator.
+    Length penalty should be done with extra operators before calculating the
+    accumulated scores if needed.
 
     Please see the following demo for a fully beam search usage example:
 
@@ -3924,6 +3929,8 @@ def beam_search(pre_ids,
             describes how these candidates belong to the prefix. The paths
             linking prefixes and selected candidates are organized and reserved
             in lod.
+        is_accumulated(bool, default True): Whether the input :attr:`scores`
+             are accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
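A minimal usage sketch of the new flag, assuming `probs` holds the raw,
non-accumulated per-step probabilities and `pre_ids`/`pre_scores` come from the
previous step; variable names here are illustrative:

.. code-block:: python

    # pre-select the top-K step scores, then let beam_search do the
    # log-transform and the accumulation with pre_scores internally
    topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
    selected_ids, selected_scores = fluid.layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=topk_scores,
        beam_size=beam_size,
        end_id=end_id,
        is_accumulated=False)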
@@ -3952,8 +3959,12 @@ def beam_search(pre_ids, end_id=end_id) """ helper = LayerHelper('beam_search', **locals()) - score_type = scores.dtype - id_type = ids.dtype + score_type = pre_scores.dtype + id_type = pre_ids.dtype + + inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores} + if ids is not None: + inputs["ids"] = ids selected_scores = helper.create_variable_for_type_inference( dtype=score_type) @@ -3961,12 +3972,7 @@ def beam_search(pre_ids, helper.append_op( type='beam_search', - inputs={ - 'pre_ids': pre_ids, - 'pre_scores': pre_scores, - 'ids': ids, - 'scores': scores, - }, + inputs=inputs, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, @@ -3976,6 +3982,7 @@ def beam_search(pre_ids, 'level': level, 'beam_size': beam_size, 'end_id': end_id, + 'is_accumulated': is_accumulated, }) return selected_ids, selected_scores @@ -5146,9 +5153,9 @@ def nce(input, littles = [] for i in range(custom_dist_len): normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5171,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left @@ -5181,14 +5188,21 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign(input=np.array(custom_dist).astype('float32')) - custom_alias = assign(input=np.array(alias_).astype('int32')) - custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32')) - - inputs['CustomDistProbs'] = probs - inputs['CustomDistAlias'] = custom_alias - inputs['CustomDistAliasProbs'] = custom_alias_probs + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) sampler = 2 else: raise Exception("Unsupported sampler type.") @@ -5849,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + stop_gradient=True) counter.stop_gradient = True return counter @@ -9468,7 +9483,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: @@ -9632,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None): return out +def shuffle_channel(x, group, name=None): + """ + **Shuffle Channel Operator** + + This operator shuffles the channels of input x. 
+    It divides the input channels into :attr:`group` subgroups and obtains a
+    new channel order by picking one element from each subgroup in turn.
+
+    Please refer to the paper
+    https://arxiv.org/pdf/1707.01083.pdf
+
+    .. code-block:: text
+
+        Given a 4-D tensor input with the shape (N, C, H, W):
+            input.shape = (1, 4, 2, 2)
+            input.data =[[[[0.1, 0.2],
+                           [0.2, 0.3]],
+
+                          [[0.3, 0.4],
+                           [0.4, 0.5]],
+
+                          [[0.5, 0.6],
+                           [0.6, 0.7]],
+
+                          [[0.7, 0.8],
+                           [0.8, 0.9]]]]
+        Given group: 2
+        then we get a 4-D tensor out with the same shape as the input:
+            out.shape = (1, 4, 2, 2)
+            out.data = [[[[0.1, 0.2],
+                          [0.2, 0.3]],
+
+                         [[0.5, 0.6],
+                          [0.6, 0.7]],
+
+                         [[0.3, 0.4],
+                          [0.4, 0.5]],
+
+                         [[0.7, 0.8],
+                          [0.8, 0.9]]]]
+
+    Args:
+        x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
+        group(int): The number of subgroups. It should evenly divide the number of channels.
+
+    Returns:
+        out(Variable): The channel-shuffled result, a tensor variable with the
+        same shape and same type as the input.
+
+    Raises:
+        TypeError: If group is not an int.
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
+            out = fluid.layers.shuffle_channel(x=input, group=2)
+    """
+    helper = LayerHelper("shuffle_channel", **locals())
+
+    if not isinstance(group, int):
+        raise TypeError("group must be int type")
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(
+        type="shuffle_channel",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"group": group})
+    return out
+
+
 class PyFuncRegistry(object):
     _register_funcs = []
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index ce9f508c9f10981d62b7f8417080f0f3b8d9b8a7..2153ca254f0e286a77160a2d53473e1bc76109d5 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'dtype': out.dtype,
             'value': float(value),
             'force_cpu': force_cpu or force_init_on_cpu()
-        })
+        },
+        stop_gradient=True)
     out.stop_gradient = True
     return out
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index b72b900d3b26cb705b76623e9bacc16c76a7c79f..14f4276e2f4fc4a24d701ef05c94b88c4f0336da 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -301,10 +301,10 @@ class Optimizer(object):
             no_grad_set (set|None): set of Variables should be ignored.
             callbacks (list|None): list of callables to run when appending backward
                 operator for one parameter.
-
+
         Return:
             list: list of (param, grad) pair, grad is the output of backward.
-
+
         Examples:
             See examples in `apply_gradients`.
         """
@@ -322,10 +322,10 @@ class Optimizer(object):
 
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.
-
+
         Returns:
             list: A list of operators appended to the current program.
-
+
         Examples:
             .. code-block:: python
 
@@ -364,7 +364,7 @@ class Optimizer(object):
 
         This method combines interface `backward()` and
         `apply_gradients()` into one.
-
+
         Args:
             loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters @@ -381,18 +381,21 @@ class Optimizer(object): optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: - params_grads = parameter_list + parameters = parameter_list else: parameters = program.global_block().all_parameters() - params_grads = [] - for param in parameters: - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + + params_grads = [] + for param in parameters: + if param.stop_gradient: + continue + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(program, startup_program): optimize_ops = self._create_optimization_pass(params_grads) else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584c399b790580757dea746d7b4e4ac80..a07ff6ac69ca20c8c68659a67606076ce8cdf027 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. persistable_vars diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 808e1e6aa80744db1289094d7c1bad00002a4c3e..c23dfa01e76c21d0d162f2fed986e2eaf3a70a6d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) +list(REMOVE_ITEM TEST_OPS test_imperative_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index faec5350424668fca6416e91c3e58174bd4ec877..f0f13a9d49c5b84521aa3e00bdcabe0c494853a7 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, - args.trainers, 
args.sync_mode) + args.trainers, args.sync_mode, False, + args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) @@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): exe.run(startup_prog) if need_load and model_dir: - self._load_persistable_vars(exe, model_dir, startup_prog) + fluid.io.load_persistables(exe, model_dir, pserver_prog) + exe.run(pserver_prog) def run_trainer(self, args): @@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = os.getenv("MODEL_DIR", "") - - if need_save: - for _ in six.moves.xrange(RUN_STEP): - loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(get_data())) - if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) - - var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - if six.PY2: - print(pickle.dumps(np.ravel(var).tolist())) + save_mode = os.getenv("SAVE_MODE", "") + + if save_mode == "LOCAL": + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor( + )) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + + elif save_mode == "DIST": + skip_steps = int(os.getenv("SKIP_STEPS")) + loss = None + if need_save: + for idx in six.moves.xrange(8): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: + io.save_persistables(startup_exe, model_dir, + trainer_prog) + else: + for idx in six.moves.xrange(8): + data = get_data() + if idx <= skip_steps: + continue + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + if six.PY2: + print(pickle.dumps(loss.tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: - sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + raise Exception("save_mode must be LOCAL or DIST") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index fac5e037a46715d146e354825f09ee8ccc4f3d70..09afae6114e2b6cc8bce9b2be3b221ba9825db8c 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(): - # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) +def get_optimizer(op="sgd"): + if op.upper() == "sgd".upper(): + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + elif op.upper() == "adam".upper(): + optimizer = fluid.optimizer.Adam(learning_rate=base_lr) + else: + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) return optimizer @@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = get_optimizer() + opt = os.getenv('OPTIMIZER', 'sgd') + opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 
69a38618cde7f100849a30e1cb1a2f4738e55e0d..0968ace62b6a4e258f7763dbf6fbeda07feb4cd5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -43,7 +43,8 @@ class TestDistRunnerBase(object): pserver_endpoints, trainers, sync_mode, - dc_asgd=False): + dc_asgd=False, + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd @@ -53,7 +54,8 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode) + sync_mode=sync_mode, + current_endpoint=current_endpoint) return t def run_pserver(self, args): @@ -122,7 +124,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 4588ca7c17ba5db893f080813d299feaa47626a7..e795bc410ee45a18cc0c7c914636f5b03309fad1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase): delta=1e-3, check_error_log=False, need_envs={}): - required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'LOCAL', + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + save_env = {} + save_env["SAVE_MODE"] = "DIST" + save_env["SAVE"] = "1" + save_env["MODEL_DIR"] = model_dir + save_env.update(required_envs) + + tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, + check_error_log) + + load_env = {} + load_env["LOAD"] = "1" + load_env["MODEL_DIR"] = model_dir + load_env.update(required_envs) + tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, + check_error_log) + + shutil.rmtree(model_dir) + + train0_1_np = np.array(tr0_var_1) + train1_1_np = np.array(tr1_var_1) + train0_2_np = np.array(tr0_var_2) + train1_2_np = np.array(tr1_var_2) + + self.assertAlmostEqual( + train0_1_np.all(), train0_2_np.all(), delta=delta) + self.assertAlmostEqual( + train1_1_np.all(), train1_2_np.all(), delta=delta) + + def test_dist(self): + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'DIST', + 'OPTIMIZER': 
'ADAM', + 'SKIP_STEPS': str(np.random.randint(2, 6)) } self.check_with_place( "dist_save_load.py", diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3d1ce6b27c935ddca0f2f5fb377e69b571e3714c..3566fed215229223f4d2ecd1bbb66cb297dd7716 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest): pserver, _ = self.get_pserver(self.pserver1_ep) pserver2, _ = self.get_pserver(self.pserver2_ep) - self.assertTrue(pserver._slice_vars_and_attrs) - self.assertTrue(pserver2._slice_vars_and_attrs) - - for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): - self.assertEqual(pserver._slice_vars_and_attrs[idx][0], - pserver2._slice_vars_and_attrs[idx][0]) - - total_numel = six.moves.reduce( - lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape) - self.assertEqual( - total_numel, - six.moves.reduce(lambda x, y: x * y, - pserver._slice_vars_and_attrs[idx][2].shape) + - six.moves.reduce(lambda x, y: x * y, - pserver2._slice_vars_and_attrs[idx][2].shape)) + vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver1_ep) + vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver2_ep) + + self.assertTrue(vars_ps1) + self.assertTrue(vars_ps2) + + for idx in six.moves.xrange(len(vars_ps1)): + total_numel = 0 + ps1_numel, ps2_numel = 0, 0 + + ps1_var = vars_ps1[idx] + + if not ps1_var.is_slice: + total_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].slice.shape) + else: + ps2_var = None + for var in vars_ps2: + if var.origin.name == ps1_var.origin.name: + ps2_var = var + break + + total_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.slice.shape) + ps2_numel = six.moves.reduce(lambda x, y: x * y, + ps2_var.slice.shape) + + self.assertEqual(total_numel, ps1_numel + ps2_numel) class TestNCCL2Transpile(TranspilerTest): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 7ec1f0ae753724dac5c4675926ead87a097a7a99..56dfb095def62bc617948821038f0c15c1547683 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -16,12 +16,17 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +# FIXME(zjl): It seems that this unittest fails randomly +# when comparing all reduce last loss and reduce last loss +# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta +# Disable it temporarily. 
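+# The test case below is wrapped in a module-level string literal, so it is
+# parsed but never collected or run; remove the quotes to re-enable it.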
+''' from test_parallel_executor_mnist import TestMNIST class EagerDeletionTestMNIST(TestMNIST): pass - +''' if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index dfe4daca95af5e7b1aff93c6fa9027dec7c64642..adf35c851bf05011223e483e472900a3d415e2ee 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + def test_sum_op(self): + x = np.ones([2, 2], np.float32) + with fluid.imperative.guard(): + inputs = [] + for _ in range(10): + inputs.append(fluid.imperative.base.to_variable(x)) + ret = fluid.layers.sums(inputs) + loss = fluid.layers.reduce_sum(ret) + loss._backward() + self.assertTrue(np.allclose(ret._numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0]._gradient(), x)) + def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() @@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase): x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) param_grads = fluid.backward.append_backward( x, parameter_list=[x1.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase): x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase): out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) static_out, static_grad = exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 4fe286f85ec551946a9431f70d7012b4e7d79662..681661bfc63db95653be371688a047efe96f3866 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope @@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_gan_float32(self): seed = 90 startup = fluid.Program() @@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase): sgd = SGDOptimizer(learning_rate=1e-3) sgd.minimize(g_loss) - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) static_params = dict() with fluid.scope_guard(scope): img = np.ones([2, 1], np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 
63eeae4b712c2064309b664b91d5f0347b67817d..d0a5a883174cb33a035b344f9489b2ba02ba99f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..87a72dd04e376cf9225e275d862b0cbbb9774e2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -0,0 +1,370 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 1281164, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # optimizer = fluid.optimizer.Momentum( + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + + return optimizer + + +class ConvBNLayer(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) 
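+
+        # NOTE: the convolution deliberately runs with act=None; the
+        # activation is applied by the BatchNorm layer constructed below.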
+ + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.imperative.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper('elementwise_add_activation', act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.imperative.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = self.out(y) + return y + + +class TestImperativeResnet(unittest.TestCase): + def test_resnet_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + for 
batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + + dy_grad_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + + optimizer.minimize(avg_loss) + + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - 
grad_start_pos]] = out[i]
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_grad_value), len(static_grad_value))
+        for key, value in six.iteritems(static_grad_value):
+            self.assertTrue(np.allclose(value, dy_grad_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_param_value), len(static_param_value))
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index ab7183f88df809e584ca50ba16221bfdfe1376a9..2d98b063d10e2bb9071c4b8dc4ac9373f63df387 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -420,5 +420,26 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.type, 'assign_value')
 
 
+class TestNumpyArrayInitializer(unittest.TestCase):
+    def test_numpy_array_initializer(self):
+        """Test the numpy array initializer with supplied arguments
+        """
+        import numpy
+        program = framework.Program()
+        block = program.global_block()
+        np_array = numpy.random.random((10000)).astype("float32")
+        for _ in range(2):
+            block.create_parameter(
+                dtype=np_array.dtype,
+                shape=np_array.shape,
+                lod_level=0,
+                name="param",
+                initializer=initializer.NumpyArrayInitializer(np_array))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+        assert (init_op.attr('fp32_values') == np_array).all()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 90f5d797a67d951e618e64cfc5a3608335714e05..c13f03e86f3e375026b04a31d51ac1a5223360ef 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase):
 
             print(str(program))
 
+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeaae9058187be1c9191bcbec21237c69fefe6e6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestShuffleChannelOp(OpTest):
+    def setUp(self):
+        self.op_type = "shuffle_channel"
+        self.batch_size = 10
+        self.input_channels = 16
+        self.layer_h = 4
+        self.layer_w = 4
+        self.group = 4
+        self.x = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_h,
+             self.layer_w)).astype('float32')
+        self.inputs = {'X': self.x}
+        self.attrs = {'group': self.group}
+        n, c, h, w = self.x.shape
+        input_reshaped = np.reshape(self.x,
+                                    (-1, self.group, c // self.group, h, w))
+        input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
+        self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
index 6b78ceeaeec4d9b3db6524a5b5e939f88267340c..89dd4dd50b0299de986b84f46e889d554030f180 100644
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ b/python/paddle/fluid/transpiler/details/checkport.py
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types
 
 
 def wait_server_ready(endpoints):
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
 
         wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ea5a4cf7cdb3ef91a02bb88d9b859da1ecd1ed0b..e58f34e3750803669149685003ea5858fa775ed7 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,7 +39,7 @@ from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
-    Parameter, grad_var_name
+    Parameter, Variable, grad_var_name
 from .details import *
 from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
@@ -62,6 +62,260 @@ def log(*args):
         print(args)
 
 
+class VarStruct(object):
+    """
+    record some properties of a Variable on the Python side.
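+
+    for example, a parameter could be described as below (an illustrative
+    sketch; the name, shape and dtype are made up, not taken from a real
+    program):
+
+        VarStruct(name="fc_0.w_0", shape=(784, 10), dtype="float32",
+                  type=core.VarDesc.VarType.LOD_TENSOR, lod_level=0,
+                  persistable=True)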
+    """
+
+    def __init__(self, name, shape, dtype, type, lod_level, persistable):
+        self.name = name
+        self.shape = shape
+        self.dtype = dtype
+        self.type = type
+        self.lod_level = lod_level
+        self.persistable = persistable
+
+
+class VarDistributed(object):
+    """
+    a class to record how a var is distributed on parameter servers.
+    it records the relationship between the origin var and its slice vars,
+    and the slice vars' properties, such as type/shape/offset/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var;
+                a var is split into slices only when its block size is greater
+                than 8192.
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the numel in the origin var
+                before this slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server the slice var is placed
+                on, such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        whether the two vars are equal.
+        Returns:
+            bool: True if equal, otherwise False
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def __str__(self):
+        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
+            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
+                   shape=self.origin.shape, dtype=self.origin.dtype)
+
+        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
+                        ".slice({is_slice}).block({block_id}).offset({offset})". \
+            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
+                   shape=self.slice.shape, dtype=self.slice.dtype,
+                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    a collection of VarDistributed records, with methods to look up
+    distributed vars. it gives an overview of the parameters distributed
+    across parameter servers, providing a centralized, convenient way for
+    developers to manage and query a variable's distribution. other modules,
+    such as io.py, can also use it to find variables.
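+
+    a minimal usage sketch (the vars w and w_block0 and the endpoint are
+    illustrative, not taken from a real program):
+
+        vars_dist = VarsDistributed()
+        vars_dist.add_distributed_var(origin_var=w, slice_var=w_block0,
+                                      is_slice=True, block_id=0, offset=0,
+                                      vtype="Param", endpoint="127.0.0.1:1001")
+        dist_var = vars_dist.get_distributed_var_by_slice(w_block0.name)
+        print(vars_dist.overview())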
+    """
+
+    def __init__(self):
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        add a distributed var to the collection.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var;
+                a var is split into slices only when its block size is greater
+                than 8192.
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the numel in the origin var
+                before this slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server the slice var is placed
+                on, such as "127.0.0.1:1001"
+        Returns:
+            None
+        """
+        self.distributed_vars.append(
+            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
+                           vtype, endpoint))
+
+    def get_distributed_var_by_slice(self, var_name):
+        """
+        get a distributed var by its slice var name.
+
+        Args:
+            var_name(str): slice var name, such as "w.trainer0.block1"
+        Returns:
+            VarDistributed: the matching distributed var, or None.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.slice.name == var_name:
+                return dist_var
+        return None
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        whether the two vars are equal.
+        Returns:
+            bool: True if equal, otherwise False
+        """
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
+        """
+        get a distributed var by its origin var name and endpoint.
+
+        Args:
+            origin_var_name(str): the name of the origin var.
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:1001"
+        Returns:
+            VarDistributed: the matching distributed var, or None.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
+                return dist_var
+        return None
+
+    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
+        """
+        get distributed vars whose vtype is in the given list.
+
+        Args:
+            vtypes(list): distributed vars' vtypes, such as "Optimizer", "RemotePrefetch"
+            groupby(bool): whether to group the result by origin var.
+
+        Returns:
+            list: distributed var list when groupby=False.
+            dict: distributed var map keyed by origin var name when groupby=True.
+        """
+        vtype_vars = []
+        for var in self.distributed_vars:
+            if var.vtype in vtypes:
+                vtype_vars.append(var)
+        if not groupby:
+            return vtype_vars
+
+        params_map = {}
+        for var in vtype_vars:
+            origin_var_name = var.origin.name
+
+            if origin_var_name in params_map:
+                optimizers = params_map.get(origin_var_name)
+            else:
+                optimizers = []
+            optimizers.append(var)
+            params_map[origin_var_name] = optimizers
+        return params_map
+
+    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
+        """
+        get distributed vars on the given endpoint, optionally filtered by vtype.
+
+        Args:
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
+            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
+
+        Returns:
+            list: distributed var list.
+        """
+        endpoint_vars = []
+        for var in self.distributed_vars:
+            if var.endpoint == endpoint:
+                endpoint_vars.append(var)
+        if not vtype:
+            return endpoint_vars
+
+        vtype_vars = []
+        for var in endpoint_vars:
+            if var.vtype == vtype:
+                vtype_vars.append(var)
+        return vtype_vars
+
+    def overview(self):
+        """
+        get the overview string about all params on all parameter servers.
+
+        Returns:
+            str: overview string.
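+
+        each line is produced by VarDistributed.__str__ and looks roughly like
+        the following (an illustrative sketch; names, shapes and the endpoint
+        are made up):
+
+            var owned: Param, origin var: ( w : fluid.VarType.LOD_TENSOR.shape(100, 10).astype(float32) ), slice var: ( w.block0 : fluid.VarType.LOD_TENSOR.shape(50, 10).astype(float32).slice(True).block(0).offset(0) ), endpoint: 127.0.0.1:1001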
+ + """ + vars_str = [] + for var in self.distributed_vars: + vars_str.append(str(var)) + return "\n".join(vars_str) + + class VarBlock: def __init__(self, varname, offset, size): self.varname = varname @@ -327,6 +581,7 @@ class DistributeTranspiler(object): self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints + self.vars_overview = VarsDistributed() self.optimize_ops, self.params_grads = self._get_optimize_pass() ps_dispatcher = self.config.split_method(self.pserver_endpoints) @@ -347,6 +602,7 @@ class DistributeTranspiler(object): # add distributed attrs to program self.origin_program._is_distributed = True self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._ps_endpoint = current_endpoint self.origin_program._is_chief = self.trainer_id == 0 self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None @@ -454,6 +710,10 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + distributed_var = self.vars_overview.get_distributed_var_by_slice( + recv_vars[i].name) + distributed_var.endpoint = ep + # step4: Concat the parameters splits together after recv. all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): @@ -480,6 +740,12 @@ class DistributeTranspiler(object): recv_op_role_var_name = splited_trainer_grad[0].name if param_varname in self.sparse_param_to_height_sections: + + for table_name in table_names: + distributed_var = self.vars_overview.get_distributed_var_by_slice( + table_name) + distributed_var.vtype = "RemotePrefetch" + height_sections = self.sparse_param_to_height_sections[ param_varname] self._update_remote_sparse_update_op( @@ -532,6 +798,9 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) + self._get_distributed_optimizer_vars() + self.origin_program._parameters_on_pservers = self.vars_overview + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -541,6 +810,7 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), lr_ops) @@ -665,9 +935,14 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. + sys.stderr.write( + "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n" + ) # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed + pserver_program._copy_dist_param_info_from(self.origin_program) + # step2: Create vars to receive vars at parameter servers. 
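+        # recv_inputs holds the vars created on this endpoint to receive
+        # blocks from trainers; it is filled from self.param_grad_ep_mapping
+        # below.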
recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -703,9 +978,6 @@ class DistributeTranspiler(object): else: recv_inputs.append(single_trainer_var) - self._slice_params_and_optimizes = self._get_slice_vars_and_attrs( - endpoint) - # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops @@ -882,10 +1154,6 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - # add distributed attrs - pserver_program._slice_vars_and_attrs = list( - self._slice_params_and_optimizes.values()) - pserver_program._sync_with_cpp() # save pserver program to generate pserver side startup relatively. self.pserver_program = pserver_program @@ -984,30 +1252,88 @@ class DistributeTranspiler(object): inputs={"X": startup_param_var}, outputs={"Out": startup_tmpvar}) - # add slice vars - s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs - return s_prog - def _get_slice_vars_and_attrs(self, endpoint): - slice_vars_and_attrs = {} + # ====================== private transpiler functions ===================== + def _get_slice_var_info(self, slice_var): block_suffix = "block" - for param in self.param_grad_ep_mapping[endpoint]["params"]: - orig_var_name, block_name, _ = self._get_varname_parts(param.name) - if not block_name: - continue + block_idx = 0 + offset = 0 + is_slice = False - block_idx = int(block_name.split(block_suffix)[1]) - orig_var = self.origin_program.global_block().vars[orig_var_name] + orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name) - skip_dim0 = 0 - slice_vars = self.param_var_mapping[orig_var_name] - for slice_var in slice_vars[:block_idx]: - skip_dim0 += slice_var.shape[0] - slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param] - return slice_vars_and_attrs + if not block_name: + return is_slice, block_idx, offset - # ====================== private transpiler functions ===================== + block_idx = int(block_name.split(block_suffix)[1]) + skip_dim0 = 0 + slice_vars = self.param_var_mapping[orig_var_name] + + orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) + + for slice_var in slice_vars[:block_idx]: + skip_dim0 += slice_var.shape[0] + + offset = skip_dim0 * orig_dim1_flatten + is_slice = True + return is_slice, block_idx, offset + + def _get_distributed_optimizer_vars(self): + def _get_distributed_optimizer_var(endpoint): + opt_op_on_pserver = [] + for _, op in enumerate(self.optimize_ops): + if self._is_optimizer_op(op) and self._is_opt_op_on_pserver( + endpoint, op): + opt_op_on_pserver.append(op) + + for opt_op in opt_op_on_pserver: + dist_var = None + for key in opt_op.input_names: + if key == "Param": + param_name = opt_op.input(key)[0] + dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep( + param_name, endpoint) + break + for key in opt_op.input_names: + if key in ["Param", "Grad", "LearningRate"]: + continue + origin_var = self.origin_program.global_block().vars[ + opt_op.input(key)[0]] + # update accumulator variable shape + new_shape = self._get_optimizer_input_shape( + opt_op.type, key, origin_var.shape, + dist_var.slice.shape) + + if new_shape == dist_var.slice.shape: + splited_var = VarStruct( + name=origin_var.name, + shape=new_shape, + dtype=origin_var.dtype, + type=origin_var.type, + lod_level=origin_var.lod_level, + persistable=origin_var.persistable) + + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=splited_var, + is_slice=dist_var.is_slice, + 
block_id=dist_var.block_id, + offset=dist_var.offset, + vtype="Optimizer", + endpoint=endpoint) + else: + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=origin_var, + is_slice=False, + block_id=0, + offset=0, + vtype="Optimizer", + endpoint=endpoint) + + for ep in self.pserver_endpoints: + _get_distributed_optimizer_var(ep) def _update_dist_lookup_table_vars(self, param_list, grad_list, params_grads): @@ -1093,6 +1419,22 @@ class DistributeTranspiler(object): # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + + for orig_name, splited_vars in self.param_var_mapping.items(): + orig_var = self.origin_program.global_block().var(orig_name) + + for splited_var in splited_vars: + is_slice, block_id, offset = self._get_slice_var_info( + splited_var) + + self.vars_overview.add_distributed_var( + origin_var=orig_var, + slice_var=splited_var, + block_id=block_id, + offset=offset, + is_slice=is_slice, + vtype="Param") + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, @@ -1729,13 +2071,6 @@ class DistributeTranspiler(object): shape=new_shape) new_inputs[key] = tmpvar - # var shape been changed - if new_shape != var.shape: - slice_var_args = self._slice_params_and_optimizes[ - param_var.name] - self._slice_params_and_optimizes[ - var.name] = [var, slice_var_args[1], tmpvar] - # change output's ParamOut variable outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) @@ -1763,8 +2098,8 @@ class DistributeTranspiler(object): # skip per trainer vars if g.name.find(".trainer_") == -1: # only param or grads have splited blocks - if self._orig_varname(g.name) in self.grad_name_to_param_name or\ - self._orig_varname(g.name) in self.param_name_to_grad_name: + if self._orig_varname(g.name) in self.grad_name_to_param_name or \ + self._orig_varname(g.name) in self.param_name_to_grad_name: grad_block = g break return grad_block diff --git a/python/setup.py.in b/python/setup.py.in index fb4b273a0676fcbcb4402eaf54ddf73d37a2754f..c947785cbf7517be56c3e43120db65284ab22d10 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,6 +109,7 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.graph',