From ee74be3a499d9a39410cb52360122a6ba2818de3 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sun, 28 Oct 2018 22:56:39 +0800 Subject: [PATCH] [1.1] Bugfix/tensorarray (#14044) --- CMakeLists.txt | 6 ++ cmake/inference_lib.cmake | 3 + paddle/fluid/framework/lod_tensor_array.h | 78 ++++++++++++++++++- paddle/fluid/framework/scope.h | 2 + paddle/fluid/inference/CMakeLists.txt | 4 +- paddle/fluid/inference/api/CMakeLists.txt | 21 +++-- .../fluid/inference/api/analysis_predictor.cc | 8 ++ .../fluid/inference/api/analysis_predictor.h | 2 + paddle/fluid/inference/api/api_impl.cc | 5 ++ paddle/fluid/inference/api/api_impl.h | 5 +- paddle/fluid/inference/api/demo_ci/run.sh | 17 ++-- .../api/details/reset_tensor_array.cc | 50 ++++++++++++ .../api/details/reset_tensor_array.h | 37 +++++++++ .../tests/api/analyzer_rnn1_tester.cc | 1 + .../fluid/operators/beam_search_decode_op.cc | 3 + paddle/scripts/paddle_build.sh | 2 +- 16 files changed, 225 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/inference/api/details/reset_tensor_array.cc create mode 100644 paddle/fluid/inference/api/details/reset_tensor_array.h diff --git a/CMakeLists.txt b/CMakeLists.txt index aca5b3dfb..d3379a663 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_INFERENCE "Compile fluid inference library" ON) +option(ON_INFER "Turn on inference optimization." OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) @@ -302,3 +303,8 @@ if(WITH_DOC) find_python_module(recommonmark REQUIRED) add_subdirectory(doc) endif() + +if (ON_INFER) + message(WARNING "On inference mode, will take place some specific optimization.") + add_definitions(-DPADDLE_ON_INFERENCE) +endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index efdb093a7..1047b6f99 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -14,6 +14,9 @@ # make package for paddle fluid shared and static library function(copy TARGET) + if (NOT ON_INFER) + message(WARNING "Turn on the ON_INFER flag when building inference_lib only.") + endif() set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h index 6d7b6a4ad..0ad6a7090 100644 --- a/paddle/fluid/framework/lod_tensor_array.h +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -18,6 +18,82 @@ limitations under the License. */ namespace paddle { namespace framework { + +// NOTE The vector can't be replaced with the class LoDTensorArray +// directly, because there are many vector used accross the project, +// and some of them are treated as LoDTensorArray. +#if !defined(PADDLE_ON_INFERENCE) + using LoDTensorArray = std::vector; -} + +#else // !PADDLE_ON_INFERENCE + +#pragma message "LoDTensorArray is replaced with the inference one." +/* + * A LoDTensorArray which will not deallocate buffer when resized, fix the data + * diff in inference, and more performance friendly in the concurrency + * scenerios. + */ +class LoDTensorArray { + public: + LoDTensorArray() = default; + + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; + + const_iterator begin() const { return array_.begin(); } + const_iterator end() const { return array_.begin() + size_; } + iterator begin() { return array_.begin(); } + iterator end() { return array_.begin() + size_; } + + void push_back(const LoDTensor& x) { + if (size_ < array_.size()) { + array_[size_++] = x; + } else { + array_.push_back(x); + ++size_; + } + } + void resize(size_t size) { + if (array_.size() < size) { + array_.resize(size); + } + size_ = size; + } + + void emplace_back() { array_.emplace_back(); } + + void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); } + + LoDTensor& back() { return array_.back(); } + + size_t space() const { return array_.size(); } + + void reserve(size_t size) { + // Naive warning to tell user this array might be to large. The memory and + // buffer used by this TensorArray will not be deleted during the training + // and inference phase, so attention not to make it expand too long. + if (size > 800UL) { + LOG(WARNING) << "TensorArray has more than 800 items"; + } + array_.reserve(size); + } + + bool empty() const { return size_ == 0UL; } + void clear() { size_ = 0UL; } + + LoDTensor& operator[](size_t id) { return array_[id]; } + const LoDTensor& operator[](size_t id) const { return array_[id]; } + LoDTensor& at(size_t id) { return array_.at(id); } + const LoDTensor& at(size_t id) const { return array_.at(id); } + + size_t size() const { return size_; } + + private: + size_t size_{0}; + std::vector array_; +}; +#endif // !PADDLE_ON_INFERENCE + +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 14f9f3681..9462620e8 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -78,6 +78,8 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); + std::list& kids() const { return kids_; } + /// Find if a scope exists in the kid scopes bool HasKid(const Scope* scope) const; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 9794a193b..dbbe8bcba 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -30,7 +30,7 @@ if (WITH_GPU AND TENSORRT_FOUND) endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor) +cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. @@ -40,7 +40,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api) + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0ddd5d53f..e2027b7cb 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,7 +18,8 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB} + ) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) @@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + if (WITH_GPU) + cc_test(${TARGET_NAME} + SRCS ${inference_test_SRC} + DEPS "${inference_deps}" + ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15) + else() + cc_test(${TARGET_NAME} + SRCS ${inference_test_SRC} + DEPS "${inference_deps}" + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + endif() if(inference_test_ARGS) set_tests_properties(${TARGET_NAME} PROPERTIES DEPENDS "${inference_test_ARGS}") @@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index eec665767..54c37fe64 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -82,6 +82,7 @@ bool AnalysisPredictor::Init( // Get the feed_target_names and fetch_target_names PrepareFeedFetch(); + return true; } @@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector &inputs, return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } @@ -322,6 +327,9 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5a9f4d369..b7dc20673 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" @@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor { // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, so cache them. std::vector feed_tensors_; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 7cda9c5d8..d06ab8f8c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 7882f6a53..4e4ab47ca 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -26,11 +26,11 @@ limitations under the License. */ #include #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" - #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" @@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor { std::vector fetchs_; // Do not use unique_ptr, use parent scope to delete framework::Scope *sub_scope_{nullptr}; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 6e682b695..340e84d93 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -16,7 +16,7 @@ if [ $2 == ON ]; then fi if [ $3 == ON ]; then use_gpu_list='true false' -else +else use_gpu_list='false' fi @@ -60,7 +60,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DON_INFER=ON make -j word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -80,10 +81,11 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DON_INFER=ON make -j for use_gpu in $use_gpu_list; do - for vis_demo_name in $vis_demo_list; do + for vis_demo_name in $vis_demo_list; do ./vis_demo \ --modeldir=$DATA_DIR/$vis_demo_name/model \ --data=$DATA_DIR/$vis_demo_name/data.txt \ @@ -95,7 +97,7 @@ for WITH_STATIC_LIB in ON OFF; do fi done done - + # --------tensorrt mobilenet------ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then rm -rf * @@ -106,8 +108,9 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ - -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR - make -j + -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR \ + -DON_INFER=ON + make -j ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ --data=$DATA_DIR/mobilenet/data.txt \ diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc new file mode 100644 index 000000000..4ae6c6dc9 --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" + +namespace paddle { +namespace details { + +// Should be called after the parameters are loaded. +void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { + if (flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + // TODO(Superjomn) should avoid the case when a TensorArray is a + // parameter. + if (var_name == "feed" || var_name == "fetch") continue; + if (var->Type() == typeid(framework::LoDTensorArray)) { + VLOG(4) << "collect " << var_name; + arrays_.push_back(var->GetMutable()); + } + } + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + + VLOG(3) << "Collect " << arrays_.size() << " arrays"; + flag_ = false; + } +} + +// Should be called when `Run` finished. +void TensorArrayBatchCleaner::ResetTensorArray() { + for (auto *arr : arrays_) { + arr->clear(); + } +} + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h new file mode 100644 index 000000000..a39449ff0 --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace details { + +// Clean the TensorArray each batch to make the behavior the same with the +// training phase. +struct TensorArrayBatchCleaner { + // Fix the tensor array not clear in the inference scenarios. + void CollectTensorArrays(framework::Scope *scope); + void ResetTensorArray(); + + private: + bool flag_{true}; + std::vector arrays_; +}; + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 639947668..e0416ff95 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -228,6 +228,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_rnn1, profile) { contrib::AnalysisConfig cfg; SetConfig(&cfg); + cfg.use_gpu = false; std::vector outputs; std::vector> input_slots_all; diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index b6cb93581..0d32cae0e 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor { bool tensor_on_gpu_; size_t beam_size_; int end_id_; + // TODO(Superjomn) Here might result serious performance issue in the + // concurrency + // scenarios. const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d6b9d1108..5a71382fb 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -659,7 +659,7 @@ function gen_fluid_lib() { Generating fluid library for train and inference ... ======================================== EOF - cmake .. -DWITH_DISTRIBUTE=OFF + cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON make -j `nproc` fluid_lib_dist make -j `nproc` inference_lib_dist fi -- GitLab