From c8744d118d3ad02ae250f66f9b4f82f2e0687d67 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 28 Sep 2018 12:51:12 +0800 Subject: [PATCH] fea/infer executor and concurrency performance issue bug fix (#13451) - add naive executor - fix concurrency performance issue --- cmake/external/anakin.cmake | 1 + paddle/fluid/framework/CMakeLists.txt | 7 +- paddle/fluid/framework/ir/CMakeLists.txt | 12 +- paddle/fluid/framework/naive_executor.cc | 150 +++++++++ paddle/fluid/framework/naive_executor.h | 63 ++++ paddle/fluid/framework/naive_executor_test.cc | 70 +++++ paddle/fluid/framework/operator.cc | 10 +- paddle/fluid/framework/scope.cc | 31 ++ paddle/fluid/inference/CMakeLists.txt | 2 +- .../fluid/inference/analysis/CMakeLists.txt | 2 +- paddle/fluid/inference/api/CMakeLists.txt | 20 +- .../fluid/inference/api/analysis_predictor.cc | 242 +++++++++++++-- .../fluid/inference/api/analysis_predictor.h | 59 +++- .../api/analysis_predictor_tester.cc | 67 ++++ paddle/fluid/inference/api/api.cc | 38 ++- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/api_impl.h | 22 +- paddle/fluid/inference/api/api_impl_tester.cc | 6 +- .../inference/api/details/zero_copy_tensor.cc | 111 +++++++ .../api/details/zero_copy_tensor_dummy.cc | 46 +++ paddle/fluid/inference/api/helper.h | 138 +++++++++ .../inference/api/paddle_inference_api.h | 54 +++- .../tests/api/analyzer_lac_tester.cc | 7 +- .../tests/api/analyzer_ner_tester.cc | 5 +- .../tests/api/analyzer_rnn1_tester.cc | 285 +++++++++++++++++- .../tests/api/analyzer_seq_conv1_tester.cc | 3 +- .../tests/api/analyzer_vis_tester.cc | 4 +- .../fluid/inference/tests/api/tester_helper.h | 6 +- paddle/fluid/memory/malloc.cc | 21 ++ paddle/fluid/string/pretty_log.h | 8 +- .../fluid/tests/unittests/CMakeLists.txt | 1 - 31 files changed, 1387 insertions(+), 106 deletions(-) create mode 100644 paddle/fluid/framework/naive_executor.cc create mode 100644 paddle/fluid/framework/naive_executor.h create mode 100644 paddle/fluid/framework/naive_executor_test.cc create mode 100644 paddle/fluid/inference/api/analysis_predictor_tester.cc create mode 100644 paddle/fluid/inference/api/details/zero_copy_tensor.cc create mode 100644 paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index ed054ff41ae..84354c446e2 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_LOGGER=YES -DUSE_X86_PLACE=YES -DBUILD_WITH_UNIT_TEST=NO -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 69c6dd02005..39898dd2364 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op) endif() if (NOT WIN32) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9796f277895..a0bf1afd402 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) -if(WITH_MKLDNN) - pass_library(conv_relu_mkldnn_fuse_pass inference) -endif() +if (WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) @@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +if (WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif () diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc new file mode 100644 index 00000000000..f681d4ecef9 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { + +// These code can be shared with Executor. +static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::CHANNEL) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + var_type); + } +} + +void NaiveExecutor::Prepare(Scope *parent_scope, + const ProgramDesc &program_desc, int block_id, + bool with_feed_fetch_ops) { + if (!parent_scope) { + scope_ = new framework::Scope; + } else { + scope_ = &parent_scope->NewScope(); + } + CreateVariables(program_desc, scope_, block_id); + CreateOps(program_desc, block_id, with_feed_fetch_ops); +} + +void NaiveExecutor::Run() { + for (auto &op : ops_) { + VLOG(4) << "run " << op->Type(); + op->Run(*scope_, place_); + } +} + +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, + int block_id) { + PADDLE_ENFORCE(scope); + auto &global_block = desc.Block(block_id); + + const Scope *ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + // Create persistable vars in ancestor scope. + if (var->Persistable()) { + auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { // Create temporary variables in local scope. + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto &var : global_block.AllVars()) { + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + +void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, + bool with_feed_fetch_ops) { + for (const auto &op_desc : desc.Block(block_id).AllOps()) { + if (!with_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s", + op_desc->Input("X")[0], op_desc->Type(), + op_desc->Output("Out")[0]); + continue; + } + ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); + } +} + +LoDTensor *NaiveExecutor::FindTensor(const std::string &name) { + PADDLE_ENFORCE(scope_, "Need to init scope first"); + auto *var = scope_->FindVar(name); + PADDLE_ENFORCE(var, "No variable [%s] in the scope"); + auto *tensor = const_cast(&var->Get()); + return tensor; +} + +void NaiveExecutor::CleanFeedFetchOps() { + std::vector> ops; + for (auto &op : ops_) { + if (op->Type() != "feed" && op->Type() != "fetch") { + ops.emplace_back(std::move(op)); + } + } + ops_.swap(ops); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h new file mode 100644 index 00000000000..9355e9e36a6 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/* + * Simple, intuitive and effective. Only single thread is supported, and + * currently designed for inference. + */ +class NaiveExecutor { + public: + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} + + // Create child scope. + // Create variables. + // @with_feed_fetch_ops: whether to work with the feed and fetch operators. + void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, + int block_id, bool with_feed_fetch_ops); + + // Run all the operators. + void Run(); + + // Get an tensor to operating directly, without the need for feed_ops. + LoDTensor* FindTensor(const std::string& name); + + Scope* scope() { return scope_; } + + void CleanFeedFetchOps(); + + protected: + void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); + + void CreateOps(const ProgramDesc& desc, int block_id, + bool with_feed_fetch_ops); + + private: + const platform::Place place_; + // Catch the required resource to avoid recreate. + std::vector> ops_; + Scope* scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc new file mode 100644 index 00000000000..6b9f79b9d39 --- /dev/null +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +TEST(NaiveExecutor, Basic) { + ProgramDesc program; + auto* main_block = program.MutableBlock(0); + auto* a = main_block->Var("a"); // input + auto* b = main_block->Var("b"); // input + auto* c = main_block->Var("c"); // input + a->SetType(proto::VarType::LOD_TENSOR); + b->SetType(proto::VarType::LOD_TENSOR); + c->SetType(proto::VarType::LOD_TENSOR); + + auto* add = main_block->AppendOp(); + add->SetType("elementwise_add"); + add->SetInput("X", {"a"}); + add->SetInput("Y", {"b"}); + add->SetOutput("Out", {"c"}); + + auto place = platform::CPUPlace(); + NaiveExecutor exe(place); + exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + auto* a_tensor = exe.FindTensor("a"); + auto* b_tensor = exe.FindTensor("b"); + auto* c_tensor = exe.FindTensor("c"); + + a_tensor->Resize({1, 4}); + b_tensor->Resize({1, 4}); + c_tensor->Resize({1, 4}); + b_tensor->mutable_data(place); + a_tensor->mutable_data(place); + + float a_arr[] = {0, 1, 2, 3}; + float b_arr[] = {0.0, .1, .2, .3}; + + std::copy_n(a_arr, 4, a_tensor->mutable_data(place)); + std::copy_n(b_arr, 4, b_tensor->mutable_data(place)); + + exe.Run(); + + auto* c_data = c_tensor->mutable_data(place); + for (int i = 0; i < 4; i++) { + EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3); + } +} + +} // namespace framework +} // namespace paddle + +USE_OP(elementwise_add); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e800cb9993d..96624e33c63 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); + + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + } + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c7..40dee143f5d 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -20,6 +20,13 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" +// The mutex is not needed by training and inference, only for distribution. +#if PADDLE_WITH_DISTRIBUTE +#define WITH_LOCK 1 +#else +#define WITH_LOCK 0 +#endif + DEFINE_bool(benchmark, false, "Doing memory benchmark. It will make deleting scope synchronized, " "and add some memory usage logs." @@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return VarInternal(name); } Variable* Scope::Var(std::string* name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindScopeInternal(var); } void Scope::DropKids() { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +124,9 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6698efd1fa7..db381bbc391 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -53,7 +53,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c2a1c6634bd..c740ea009f6 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e569df94c54..32d58b87413 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() function(inference_api_test TARGET_NAME) @@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api inference_api_test(test_api_impl SRC api_impl_tester.cc ARGS test_word2vec test_image_classification) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book) + if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1032aadcbda..0c11694d5a9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -16,11 +16,15 @@ #include #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -28,8 +32,11 @@ DECLARE_bool(profile); namespace paddle { +using contrib::AnalysisConfig; + bool AnalysisPredictor::Init( - const std::shared_ptr& parent_scope) { + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { @@ -43,7 +50,8 @@ bool AnalysisPredictor::Init( if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently"; + LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " + "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); @@ -56,37 +64,134 @@ bool AnalysisPredictor::Init( scope_.reset(new paddle::framework::Scope()); } - executor_.reset(new paddle::framework::Executor(place_)); + executor_.reset(new paddle::framework::NaiveExecutor(place_)); - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), - config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + if (!program) { + if (!LoadProgramDesc()) return false; + OptimizeInferenceProgram(); } else { - LOG(ERROR) << "fail to load inference model from " << config_.model_dir; + inference_program_ = program; + } + executor_->Prepare(scope_.get(), *inference_program_, 0, + config_.use_feed_fetch_ops); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; return false; } + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); - OptimizeInferenceProgram(); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; } - ctx_ = executor_->Prepare(*inference_program_, 0); + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} - VLOG(5) << "to create variables"; - PADDLE_ENFORCE(scope_.get()); - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } return true; } @@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } + argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); PADDLE_ENFORCE( @@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr -CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -150,15 +255,90 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } +void AnalysisPredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + executor_->Run(); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::unique_ptr tmp_exe( + new framework::Executor(platform::CPUPlace())); + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir, + config_.param_file); + return false; + } + return true; +} +std::unique_ptr AnalysisPredictor::Clone() { + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + template <> std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { + const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index aa00e8be5c2..0d01d7ac2b2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,42 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" namespace paddle { using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using framework::NaiveExecutor; +using contrib::AnalysisConfig; /* This predictor is based on the original native predictor with IR and Analysis * support. It will optimize IR and Parameters in the runtime. * TODO(Superjomn) Replace the Navive predictor? */ -class AnalysisPredictor : public NativePaddlePredictor { +class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const contrib::AnalysisConfig& config) - : NativePaddlePredictor(config), config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} - bool Init(const std::shared_ptr& parent_scope); + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void PrepareFeedFetch(); void OptimizeInferenceProgram(); - Argument& analysis_argument() { return argument_; } + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return executor_->scope(); } + framework::ProgramDesc &program() { return *inference_program_; } + + protected: + bool LoadProgramDesc(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); private: contrib::AnalysisConfig config_; Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, so cache them. + std::vector feed_tensors_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 00000000000..1d25f55b318 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +TEST(AnalysisPredictor, ZeroCopy) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.use_feed_fetch_ops = false; + + auto predictor = + CreatePaddlePredictor( + config); + + auto w0 = predictor->GetInputTensor("firstw"); + auto w1 = predictor->GetInputTensor("secondw"); + auto w2 = predictor->GetInputTensor("thirdw"); + auto w3 = predictor->GetInputTensor("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); + auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); + auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); + auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->ZeroCopyRun(); + + auto out = predictor->GetOutputTensor("fc_1.tmp_2"); + PaddlePlace place; + int size = 0; + auto* out_data = out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + LOG(INFO) << "output_data: " << out_data; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index c71769a32f6..01ea942d3c8 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,16 +1,22 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle_inference_api.h" namespace paddle { @@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { } } -PaddleBuf::PaddleBuf(PaddleBuf&& other) +PaddleBuf::PaddleBuf(PaddleBuf &&other) : data_(other.data_), length_(other.length_), memory_owned_(other.memory_owned_) { @@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) other.length_ = 0; } -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (!other.memory_owned_) { data_ = other.data_; length_ = other.length_; @@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { return *this; } -PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { // only the buffer with external memory can be copied data_ = other.data_; length_ = other.length_; @@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { } } -void PaddleBuf::Reset(void* data, size_t length) { +void PaddleBuf::Reset(void *data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { PADDLE_ENFORCE_GT(length_, 0); - free(static_cast(data_)); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index dca4386b21b..53740899cd4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, VLOG(4) << "Run prepared context"; executor_->RunPreparedContext(ctx_.get(), scope, false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + false /* don't create variable each time */); VLOG(4) << "Finish prepared context"; // get fetch variable if (!GetFetch(output_data, scope)) { diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6386d601262..7882f6a53c7 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -30,6 +30,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" @@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + protected: bool SetFeed(const std::vector &input_datas, framework::Scope *scope); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index fc1364b80ac..106a941b295 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc new file mode 100644 index 00000000000..14698f6dfc8 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(input_or_output_, + "Can't reshape the output tensor, it is readonly"); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(shape)); +} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + auto *tensor = static_cast(FindTensor()); + switch (static_cast(place)) { + case static_cast(PaddlePlace::kCPU): { + return tensor->mutable_data(platform::CPUPlace()); + } + case static_cast(PaddlePlace::kGPU): { + return tensor->mutable_data(platform::CUDAPlace()); + } + default: + PADDLE_THROW("Unsupported place: %d", static_cast(place)); + break; + } + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + auto *tensor = static_cast(FindTensor()); + auto *res = tensor->data(); + + if (platform::is_cpu_place(tensor->place())) { + *place = PaddlePlace::kCPU; + } else if (platform::is_gpu_place(tensor->place())) { + *place = PaddlePlace::kGPU; + } else { + *place = PaddlePlace::kUNK; + } + + *size = tensor->numel(); + return res; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + return tensor; +} + +std::vector ZeroCopyTensor::shape() { + auto *tensor = static_cast(FindTensor()); + PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); + return framework::vectorize(tensor->dims()); +} + +void ZeroCopyTensor::SetLoD(const std::vector> &x) { + auto *tensor = static_cast(FindTensor()); + framework::LoD lod; + for (auto &level : x) { + lod.emplace_back(level); + } + tensor->set_lod(lod); +} + +std::vector> ZeroCopyTensor::lod() const { + std::vector> res; + auto *tensor = static_cast(FindTensor()); + for (auto &level : tensor->lod()) { + res.emplace_back(level); + } + return res; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc new file mode 100644 index 00000000000..2d5b561d801 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) {} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + return nullptr; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { return nullptr; } + +std::vector ZeroCopyTensor::shape() { return {}; } + +void ZeroCopyTensor::SetLoD(const std::vector> &x) {} + +std::vector> ZeroCopyTensor::lod() const { + return std::vector>(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 1fec2f96da0..dbbd3f6a678 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -21,8 +21,10 @@ #include #include #include +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace inference { @@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { + int size{0}; + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + ptr[c++] = v; + } + } + return size; +} + static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; @@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == typeid(float)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == typeid(int64_t)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 984358b2bd9..3aa5c614687 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -101,6 +101,40 @@ struct PaddleTensor { std::vector> lod; // Tensor+LoD equals LoDTensor }; +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. + template + T* data(PaddlePlace* place, int* size); + + std::vector shape(); + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + + protected: + ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + /* * A simple Inference API for Paddle. */ @@ -120,6 +154,19 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. virtual std::unique_ptr Clone() = 0; @@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig { IrPassMode ir_mode{IrPassMode::kExclude}; std::vector ir_passes; - // NOTE this is just for internal development, please not use it. + // NOT stable yet. + bool use_feed_fetch_ops{true}; + + // NOTE this is just for internal development, please not use it. NOT + // stable + // yet. bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 36bbec47311..5fb551810fd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,6 +18,8 @@ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; + struct DataRecord { std::vector data; std::vector lod; @@ -78,6 +80,7 @@ struct DataRecord { } } } + DataRecord NextBatch() { DataRecord data; data.data = batched_datas[batch_iter]; @@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 59020545cd6..577b97e271a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,6 +16,7 @@ namespace paddle { namespace inference { +using contrib::AnalysisConfig; struct DataRecord { std::vector> word_data_all, mention_data_all; @@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3bf5383d8f3..d2e344111bd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,12 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(with_precision_check, true, "turn on test"); + namespace paddle { namespace inference { using namespace framework; // NOLINT +using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -29,10 +33,12 @@ struct DataRecord { size_t batch_iter{0}; size_t batch_size{1}; DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; @@ -101,6 +107,7 @@ struct DataRecord { num_samples = num_lines; } }; + void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, @@ -149,7 +156,55 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM @@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); +} + +bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope, + const std::vector &tensors) { + for (auto &x : tensors) { + auto *a_var = a_scope.FindVar(x); + auto *b_var = b_scope.FindVar(x); + if (a_var && b_var) { + if (a_var->Type() == typeid(framework::LoDTensor) || + a_var->Type() == typeid(framework::Tensor)) { + LOG(INFO) << "comparing tensor " << x; + auto &a_t = a_var->Get(); + auto &b_t = b_var->Get(); + if (!inference::CompareTensor(a_t, b_t)) { + LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); + } + } else { + LOG(INFO) << "skip no tensor " << x; + } + } else { + LOG(INFO) << "skip tensor " << x; + } + } + return true; +} + +// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing +// on the complex RNN1 model. +TEST(Analyzer_rnn1, ZeroCopy) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + + PaddlePlace place; + int output_size{0}; + + auto predictor = + CreatePaddlePredictor( + config); + + config.use_feed_fetch_ops = true; + auto native_predictor = + CreatePaddlePredictor(config); + + config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + auto analysis_predictor = + CreatePaddlePredictor( + config); + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), + data_tensor.get(), hidden_init_tensor.get(), + week_tensor.get(), minute_tensor.get(), &data, + FLAGS_batch_size); + + // Prepare data for NativePredictor + std::vector> native_inputs; + SetInput(&native_inputs); + std::vector native_outputs; + std::vector analysis_outputs; + + auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); + // Run analysis predictor + + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); + ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + ASSERT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + + Timer timer; + double total_time{0}; + double native_total_time{0}; + double analysis_total_time{0.}; + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + auto *output_data = output_tensor->data(&place, &output_size); + ASSERT_GT(output_size, 0); // more than one output! + + for (int i = 0; i < FLAGS_repeat; i++) { + // Run native predictor. + timer.tic(); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + native_total_time += timer.toc(); + } + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + ASSERT_TRUE( + analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); + analysis_total_time += timer.toc(); + } + + if (!FLAGS_with_precision_check) { + return; + } + int native_output_size = VecReduceToInt(native_outputs.front().shape); + + EXPECT_EQ(native_output_size, output_size); + + // Compare tensors between analysis and zerocopy + auto *p0 = static_cast(predictor.get()); + auto *p1 = static_cast(analysis_predictor.get()); + auto *p2 = static_cast(native_predictor.get()); + + std::vector tensor_names; + for (auto &var_desc : p0->program().Block(0).AllVars()) { + tensor_names.push_back(var_desc->Name()); + } + + LOG(INFO) << "Comparing tensors"; + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); + + LOG(INFO) << "output1 " << inference::LoDTensorSummary( + p0->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output2 " << inference::LoDTensorSummary( + p1->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output3 " << inference::LoDTensorSummary( + p2->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + + for (int i = 0; i < output_size; i++) { + LOG(INFO) << output_data[i] << " " + << static_cast(native_outputs.front().data.data())[i] + << " " + << static_cast(analysis_outputs.front().data.data())[i]; + EXPECT_NEAR(output_data[i], + static_cast(native_outputs.front().data.data())[i], + 1e-3); + } + + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + + LOG(INFO) << "zero average time: " + << total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "analysis average time: " + << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "native average time: " + << native_total_time / (FLAGS_repeat * FLAGS_batch_size); +} + +TEST(Analyzer_rnn1, ZeroCopyMultiThread) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + + auto base_predictor = CreatePaddlePredictor(config); + double total_time_of_threads{0}; + std::vector threads; + std::vector> predictors; + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(CreatePaddlePredictor(config)); + } + + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + threads.emplace_back([config, &total_time_of_threads, &predictors, tid] { + // auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + Timer timer; + double total_time{0}; + + for (int i = 0; i < FLAGS_repeat; i++) { + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), + cell_init_tensor.get(), data_tensor.get(), + hidden_init_tensor.get(), week_tensor.get(), + minute_tensor.get(), &data, FLAGS_batch_size); + + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + total_time_of_threads += total_time; + + LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; + }); + } + + for (auto &t : threads) { + t.join(); + } + + LOG(INFO) << "average time: " + << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 2f71ed46ffc..cb4671c4379 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 483ae66c5b2..a2e86305b85 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; struct Record { std::vector data; @@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9fcb5129d26..cb36ddc8c87 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -86,11 +86,9 @@ std::unique_ptr CreateTestPredictor( size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } -std::unordered_map GetFuseStatis(AnalysisConfig config, +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { - auto predictor = CreateTestPredictor(config); - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); + auto *analysis_predictor = static_cast(predictor); auto &fuse_statis = analysis_predictor->analysis_argument() .Get>( framework::ir::kFuseStatisAttr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e9775..0f13a4ea9c1 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -36,6 +36,8 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; @@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { return a; } +// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation, +// seems they are almost the same overhead. +struct NaiveAllocator { + void* Alloc(size_t size) { return malloc(size); } + + void Free(void* p) { + PADDLE_ENFORCE(p); + free(p); + } + + static NaiveAllocator* Instance() { + static NaiveAllocator x; + return &x; + } + + private: + std::mutex lock_; +}; + template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index a3b4e38f453..10c9eb80d0a 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -56,13 +56,13 @@ struct Style { }; template -static void PrettyLogEndl(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLogEndl(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; } template -static void PrettyLog(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLog(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset(); } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d02c890209e..723f9eb9c97 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test - if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) -- GitLab