diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index ed054ff41ae0ec5a4b31dd256e397129cba3e8f1..84354c446e2f54fa13b90fa37221eed90968b251 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_LOGGER=YES -DUSE_X86_PLACE=YES -DBUILD_WITH_UNIT_TEST=NO -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d26eebc8ff64784bdd9c37e123f49f7471ee0d50..452806a20e08c518b0f5aab7f63366eeb9341561 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) @@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 69c6dd02005902d9ee486a0b7f8f0c9a343be255..39898dd23643c5742f209858c7d3dfad89968f7d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op) endif() if (NOT WIN32) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9796f2778955ce47fe9fd1b13baa36404b1574c5..a0bf1afd402c4e4eebe13cc3fc43f44f23dccaed 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) -if(WITH_MKLDNN) - pass_library(conv_relu_mkldnn_fuse_pass inference) -endif() +if (WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) @@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +if (WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif () diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..f681d4ecef9efe2b51c7154787230e8be2fb2702 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { + +// These code can be shared with Executor. +static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::CHANNEL) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + var_type); + } +} + +void NaiveExecutor::Prepare(Scope *parent_scope, + const ProgramDesc &program_desc, int block_id, + bool with_feed_fetch_ops) { + if (!parent_scope) { + scope_ = new framework::Scope; + } else { + scope_ = &parent_scope->NewScope(); + } + CreateVariables(program_desc, scope_, block_id); + CreateOps(program_desc, block_id, with_feed_fetch_ops); +} + +void NaiveExecutor::Run() { + for (auto &op : ops_) { + VLOG(4) << "run " << op->Type(); + op->Run(*scope_, place_); + } +} + +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, + int block_id) { + PADDLE_ENFORCE(scope); + auto &global_block = desc.Block(block_id); + + const Scope *ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + // Create persistable vars in ancestor scope. + if (var->Persistable()) { + auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { // Create temporary variables in local scope. + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto &var : global_block.AllVars()) { + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + +void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, + bool with_feed_fetch_ops) { + for (const auto &op_desc : desc.Block(block_id).AllOps()) { + if (!with_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s", + op_desc->Input("X")[0], op_desc->Type(), + op_desc->Output("Out")[0]); + continue; + } + ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); + } +} + +LoDTensor *NaiveExecutor::FindTensor(const std::string &name) { + PADDLE_ENFORCE(scope_, "Need to init scope first"); + auto *var = scope_->FindVar(name); + PADDLE_ENFORCE(var, "No variable [%s] in the scope"); + auto *tensor = const_cast(&var->Get()); + return tensor; +} + +void NaiveExecutor::CleanFeedFetchOps() { + std::vector> ops; + for (auto &op : ops_) { + if (op->Type() != "feed" && op->Type() != "fetch") { + ops.emplace_back(std::move(op)); + } + } + ops_.swap(ops); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..9355e9e36a6358aa91553dca35aaf1b658516a0a --- /dev/null +++ b/paddle/fluid/framework/naive_executor.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/* + * Simple, intuitive and effective. Only single thread is supported, and + * currently designed for inference. + */ +class NaiveExecutor { + public: + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} + + // Create child scope. + // Create variables. + // @with_feed_fetch_ops: whether to work with the feed and fetch operators. + void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, + int block_id, bool with_feed_fetch_ops); + + // Run all the operators. + void Run(); + + // Get an tensor to operating directly, without the need for feed_ops. + LoDTensor* FindTensor(const std::string& name); + + Scope* scope() { return scope_; } + + void CleanFeedFetchOps(); + + protected: + void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); + + void CreateOps(const ProgramDesc& desc, int block_id, + bool with_feed_fetch_ops); + + private: + const platform::Place place_; + // Catch the required resource to avoid recreate. + std::vector> ops_; + Scope* scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5 --- /dev/null +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +TEST(NaiveExecutor, Basic) { + ProgramDesc program; + auto* main_block = program.MutableBlock(0); + auto* a = main_block->Var("a"); // input + auto* b = main_block->Var("b"); // input + auto* c = main_block->Var("c"); // input + a->SetType(proto::VarType::LOD_TENSOR); + b->SetType(proto::VarType::LOD_TENSOR); + c->SetType(proto::VarType::LOD_TENSOR); + + auto* add = main_block->AppendOp(); + add->SetType("elementwise_add"); + add->SetInput("X", {"a"}); + add->SetInput("Y", {"b"}); + add->SetOutput("Out", {"c"}); + + auto place = platform::CPUPlace(); + NaiveExecutor exe(place); + exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + auto* a_tensor = exe.FindTensor("a"); + auto* b_tensor = exe.FindTensor("b"); + auto* c_tensor = exe.FindTensor("c"); + + a_tensor->Resize({1, 4}); + b_tensor->Resize({1, 4}); + c_tensor->Resize({1, 4}); + b_tensor->mutable_data(place); + a_tensor->mutable_data(place); + + float a_arr[] = {0, 1, 2, 3}; + float b_arr[] = {0.0, .1, .2, .3}; + + std::copy_n(a_arr, 4, a_tensor->mutable_data(place)); + std::copy_n(b_arr, 4, b_tensor->mutable_data(place)); + + exe.Run(); + + auto* c_data = c_tensor->mutable_data(place); + for (int i = 0; i < 4; i++) { + EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3); + } +} + +} // namespace framework +} // namespace paddle + +USE_OP(elementwise_add); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e800cb9993ddde45de7c33b11994359e77710daf..96624e33c6323dee7b6534673278b6b1b6343ae0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); + + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + } + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c759d010606d5b605823b7252b35c69..40dee143f5d8f64a44bc2469bd5f38b89338ea5d 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -20,6 +20,13 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" +// The mutex is not needed by training and inference, only for distribution. +#if PADDLE_WITH_DISTRIBUTE +#define WITH_LOCK 1 +#else +#define WITH_LOCK 0 +#endif + DEFINE_bool(benchmark, false, "Doing memory benchmark. It will make deleting scope synchronized, " "and add some memory usage logs." @@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return VarInternal(name); } Variable* Scope::Var(std::string* name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindScopeInternal(var); } void Scope::DropKids() { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +124,9 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6698efd1fa773127a84b4bcb28f57f4226dd7ae2..db381bbc3911ad9650162d9b9012580e5b638828 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -53,7 +53,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c2a1c6634bd8f8de0796456e91cb3c530d4c6823..c740ea009f6cfc2ea250d8f1abdd7d442c2a0bb0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e569df94c54c304852dab7c7496804c1b08d665c..32d58b87413c95908644ffba31bbec22d8e23201 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() function(inference_api_test TARGET_NAME) @@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api inference_api_test(test_api_impl SRC api_impl_tester.cc ARGS test_word2vec test_image_classification) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book) + if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1032aadcbda4f1b05841e08e1abe7c737c3aeb9c..0c11694d5a905be4d9f0c6ebbc6159a4dc4a346e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -16,11 +16,15 @@ #include #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -28,8 +32,11 @@ DECLARE_bool(profile); namespace paddle { +using contrib::AnalysisConfig; + bool AnalysisPredictor::Init( - const std::shared_ptr& parent_scope) { + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { @@ -43,7 +50,8 @@ bool AnalysisPredictor::Init( if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently"; + LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " + "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); @@ -56,37 +64,134 @@ bool AnalysisPredictor::Init( scope_.reset(new paddle::framework::Scope()); } - executor_.reset(new paddle::framework::Executor(place_)); + executor_.reset(new paddle::framework::NaiveExecutor(place_)); - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), - config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + if (!program) { + if (!LoadProgramDesc()) return false; + OptimizeInferenceProgram(); } else { - LOG(ERROR) << "fail to load inference model from " << config_.model_dir; + inference_program_ = program; + } + executor_->Prepare(scope_.get(), *inference_program_, 0, + config_.use_feed_fetch_ops); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; return false; } + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); - OptimizeInferenceProgram(); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; } - ctx_ = executor_->Prepare(*inference_program_, 0); + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} - VLOG(5) << "to create variables"; - PADDLE_ENFORCE(scope_.get()); - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } return true; } @@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } + argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); PADDLE_ENFORCE( @@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr -CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -150,15 +255,90 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } +void AnalysisPredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + executor_->Run(); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::unique_ptr tmp_exe( + new framework::Executor(platform::CPUPlace())); + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir, + config_.param_file); + return false; + } + return true; +} +std::unique_ptr AnalysisPredictor::Clone() { + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + template <> std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { + const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e..0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,42 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" namespace paddle { using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using framework::NaiveExecutor; +using contrib::AnalysisConfig; /* This predictor is based on the original native predictor with IR and Analysis * support. It will optimize IR and Parameters in the runtime. * TODO(Superjomn) Replace the Navive predictor? */ -class AnalysisPredictor : public NativePaddlePredictor { +class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const contrib::AnalysisConfig& config) - : NativePaddlePredictor(config), config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} - bool Init(const std::shared_ptr& parent_scope); + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void PrepareFeedFetch(); void OptimizeInferenceProgram(); - Argument& analysis_argument() { return argument_; } + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return executor_->scope(); } + framework::ProgramDesc &program() { return *inference_program_; } + + protected: + bool LoadProgramDesc(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); private: contrib::AnalysisConfig config_; Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, so cache them. + std::vector feed_tensors_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d25f55b3188a684fe38df1417d114348cfa2e8a --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +TEST(AnalysisPredictor, ZeroCopy) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.use_feed_fetch_ops = false; + + auto predictor = + CreatePaddlePredictor( + config); + + auto w0 = predictor->GetInputTensor("firstw"); + auto w1 = predictor->GetInputTensor("secondw"); + auto w2 = predictor->GetInputTensor("thirdw"); + auto w3 = predictor->GetInputTensor("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); + auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); + auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); + auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->ZeroCopyRun(); + + auto out = predictor->GetOutputTensor("fc_1.tmp_2"); + PaddlePlace place; + int size = 0; + auto* out_data = out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + LOG(INFO) << "output_data: " << out_data; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index c71769a32f604358fe68c927546591310649f116..01ea942d3c8d20180cfc9664b8601ba87a898e86 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,16 +1,22 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle_inference_api.h" namespace paddle { @@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { } } -PaddleBuf::PaddleBuf(PaddleBuf&& other) +PaddleBuf::PaddleBuf(PaddleBuf &&other) : data_(other.data_), length_(other.length_), memory_owned_(other.memory_owned_) { @@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) other.length_ = 0; } -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (!other.memory_owned_) { data_ = other.data_; length_ = other.length_; @@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { return *this; } -PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { // only the buffer with external memory can be copied data_ = other.data_; length_ = other.length_; @@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { } } -void PaddleBuf::Reset(void* data, size_t length) { +void PaddleBuf::Reset(void *data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { PADDLE_ENFORCE_GT(length_, 0); - free(static_cast(data_)); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index dca4386b21b4a064c21b52218682321258f368c4..53740899cd4176ae007c09b7728e504675d13248 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, VLOG(4) << "Run prepared context"; executor_->RunPreparedContext(ctx_.get(), scope, false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + false /* don't create variable each time */); VLOG(4) << "Finish prepared context"; // get fetch variable if (!GetFetch(output_data, scope)) { diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6386d601262b3dac0e957fae991d23768b52f2c0..7882f6a53c7ce9a2486158ea9b50c018d1814091 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -30,6 +30,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" @@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + protected: bool SetFeed(const std::vector &input_datas, framework::Scope *scope); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index fc1364b80ac1ee2d304eb2fe429eae5f56967516..106a941b2954bc7490c4ee6380b5249e126fbfb3 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..14698f6dfc8885ec1d35f1912bad10a9caa13db4 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(input_or_output_, + "Can't reshape the output tensor, it is readonly"); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(shape)); +} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + auto *tensor = static_cast(FindTensor()); + switch (static_cast(place)) { + case static_cast(PaddlePlace::kCPU): { + return tensor->mutable_data(platform::CPUPlace()); + } + case static_cast(PaddlePlace::kGPU): { + return tensor->mutable_data(platform::CUDAPlace()); + } + default: + PADDLE_THROW("Unsupported place: %d", static_cast(place)); + break; + } + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + auto *tensor = static_cast(FindTensor()); + auto *res = tensor->data(); + + if (platform::is_cpu_place(tensor->place())) { + *place = PaddlePlace::kCPU; + } else if (platform::is_gpu_place(tensor->place())) { + *place = PaddlePlace::kGPU; + } else { + *place = PaddlePlace::kUNK; + } + + *size = tensor->numel(); + return res; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + return tensor; +} + +std::vector ZeroCopyTensor::shape() { + auto *tensor = static_cast(FindTensor()); + PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); + return framework::vectorize(tensor->dims()); +} + +void ZeroCopyTensor::SetLoD(const std::vector> &x) { + auto *tensor = static_cast(FindTensor()); + framework::LoD lod; + for (auto &level : x) { + lod.emplace_back(level); + } + tensor->set_lod(lod); +} + +std::vector> ZeroCopyTensor::lod() const { + std::vector> res; + auto *tensor = static_cast(FindTensor()); + for (auto &level : tensor->lod()) { + res.emplace_back(level); + } + return res; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d5b561d801cd9e734cab13b28e7285493e30f94 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) {} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + return nullptr; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { return nullptr; } + +std::vector ZeroCopyTensor::shape() { return {}; } + +void ZeroCopyTensor::SetLoD(const std::vector> &x) {} + +std::vector> ZeroCopyTensor::lod() const { + return std::vector>(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 1fec2f96da0f9d978a3537b2d78e4ce5ef628c81..dbbd3f6a6786a4a4849002878263353919e8f31b 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -21,8 +21,10 @@ #include #include #include +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace inference { @@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { + int size{0}; + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + ptr[c++] = v; + } + } + return size; +} + static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; @@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == typeid(float)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == typeid(int64_t)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 984358b2bd90daf768cea0a6e36b5805d81050d6..3aa5c614687953f824fc5a94e8bde29090dbeb5d 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -101,6 +101,40 @@ struct PaddleTensor { std::vector> lod; // Tensor+LoD equals LoDTensor }; +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. + template + T* data(PaddlePlace* place, int* size); + + std::vector shape(); + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + + protected: + ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + /* * A simple Inference API for Paddle. */ @@ -120,6 +154,19 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. virtual std::unique_ptr Clone() = 0; @@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig { IrPassMode ir_mode{IrPassMode::kExclude}; std::vector ir_passes; - // NOTE this is just for internal development, please not use it. + // NOT stable yet. + bool use_feed_fetch_ops{true}; + + // NOTE this is just for internal development, please not use it. NOT + // stable + // yet. bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 36bbec473114cfd2e68c97a53264957477ade3fb..5fb551810fd4d1c56547a8aa581cb6c4587df031 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,6 +18,8 @@ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; + struct DataRecord { std::vector data; std::vector lod; @@ -78,6 +80,7 @@ struct DataRecord { } } } + DataRecord NextBatch() { DataRecord data; data.data = batched_datas[batch_iter]; @@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 59020545cd609961487cafc4a08c20951a02c8ce..577b97e271aacab5d6740de7c8bc00bc87ae54dd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,6 +16,7 @@ namespace paddle { namespace inference { +using contrib::AnalysisConfig; struct DataRecord { std::vector> word_data_all, mention_data_all; @@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3bf5383d8f35347c767d6caee83e0dcc5fb0a446..d2e344111bdf84c936bbef7ff51246b0f248f41d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,12 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(with_precision_check, true, "turn on test"); + namespace paddle { namespace inference { using namespace framework; // NOLINT +using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -29,10 +33,12 @@ struct DataRecord { size_t batch_iter{0}; size_t batch_size{1}; DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; @@ -101,6 +107,7 @@ struct DataRecord { num_samples = num_lines; } }; + void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, @@ -149,7 +156,55 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM @@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); +} + +bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope, + const std::vector &tensors) { + for (auto &x : tensors) { + auto *a_var = a_scope.FindVar(x); + auto *b_var = b_scope.FindVar(x); + if (a_var && b_var) { + if (a_var->Type() == typeid(framework::LoDTensor) || + a_var->Type() == typeid(framework::Tensor)) { + LOG(INFO) << "comparing tensor " << x; + auto &a_t = a_var->Get(); + auto &b_t = b_var->Get(); + if (!inference::CompareTensor(a_t, b_t)) { + LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); + } + } else { + LOG(INFO) << "skip no tensor " << x; + } + } else { + LOG(INFO) << "skip tensor " << x; + } + } + return true; +} + +// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing +// on the complex RNN1 model. +TEST(Analyzer_rnn1, ZeroCopy) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + + PaddlePlace place; + int output_size{0}; + + auto predictor = + CreatePaddlePredictor( + config); + + config.use_feed_fetch_ops = true; + auto native_predictor = + CreatePaddlePredictor(config); + + config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + auto analysis_predictor = + CreatePaddlePredictor( + config); + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), + data_tensor.get(), hidden_init_tensor.get(), + week_tensor.get(), minute_tensor.get(), &data, + FLAGS_batch_size); + + // Prepare data for NativePredictor + std::vector> native_inputs; + SetInput(&native_inputs); + std::vector native_outputs; + std::vector analysis_outputs; + + auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); + // Run analysis predictor + + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); + ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + ASSERT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + + Timer timer; + double total_time{0}; + double native_total_time{0}; + double analysis_total_time{0.}; + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + auto *output_data = output_tensor->data(&place, &output_size); + ASSERT_GT(output_size, 0); // more than one output! + + for (int i = 0; i < FLAGS_repeat; i++) { + // Run native predictor. + timer.tic(); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + native_total_time += timer.toc(); + } + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + ASSERT_TRUE( + analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); + analysis_total_time += timer.toc(); + } + + if (!FLAGS_with_precision_check) { + return; + } + int native_output_size = VecReduceToInt(native_outputs.front().shape); + + EXPECT_EQ(native_output_size, output_size); + + // Compare tensors between analysis and zerocopy + auto *p0 = static_cast(predictor.get()); + auto *p1 = static_cast(analysis_predictor.get()); + auto *p2 = static_cast(native_predictor.get()); + + std::vector tensor_names; + for (auto &var_desc : p0->program().Block(0).AllVars()) { + tensor_names.push_back(var_desc->Name()); + } + + LOG(INFO) << "Comparing tensors"; + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); + + LOG(INFO) << "output1 " << inference::LoDTensorSummary( + p0->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output2 " << inference::LoDTensorSummary( + p1->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output3 " << inference::LoDTensorSummary( + p2->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + + for (int i = 0; i < output_size; i++) { + LOG(INFO) << output_data[i] << " " + << static_cast(native_outputs.front().data.data())[i] + << " " + << static_cast(analysis_outputs.front().data.data())[i]; + EXPECT_NEAR(output_data[i], + static_cast(native_outputs.front().data.data())[i], + 1e-3); + } + + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + + LOG(INFO) << "zero average time: " + << total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "analysis average time: " + << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "native average time: " + << native_total_time / (FLAGS_repeat * FLAGS_batch_size); +} + +TEST(Analyzer_rnn1, ZeroCopyMultiThread) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + + auto base_predictor = CreatePaddlePredictor(config); + double total_time_of_threads{0}; + std::vector threads; + std::vector> predictors; + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(CreatePaddlePredictor(config)); + } + + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + threads.emplace_back([config, &total_time_of_threads, &predictors, tid] { + // auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + Timer timer; + double total_time{0}; + + for (int i = 0; i < FLAGS_repeat; i++) { + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), + cell_init_tensor.get(), data_tensor.get(), + hidden_init_tensor.get(), week_tensor.get(), + minute_tensor.get(), &data, FLAGS_batch_size); + + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + total_time_of_threads += total_time; + + LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; + }); + } + + for (auto &t : threads) { + t.join(); + } + + LOG(INFO) << "average time: " + << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 2f71ed46ffc9fd5f853f5b5b46de1446d28b9e69..cb4671c4379b5f6f144bfd5330866aa38163f4d4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 483ae66c5b24f6147b1b07da86494a914f80c34c..a2e86305b85dd893f578e97e0105fec828916fb4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; struct Record { std::vector data; @@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9fcb5129d268a7730c11e5910077ad233050484e..cb36ddc8c879b1aff9838bba90364b17d53aa84e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -86,11 +86,9 @@ std::unique_ptr CreateTestPredictor( size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } -std::unordered_map GetFuseStatis(AnalysisConfig config, +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { - auto predictor = CreateTestPredictor(config); - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); + auto *analysis_predictor = static_cast(predictor); auto &fuse_statis = analysis_predictor->analysis_argument() .Get>( framework::ir::kFuseStatisAttr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e977533358ef52521b36e67f0ada950e61..0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -36,6 +36,8 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; @@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { return a; } +// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation, +// seems they are almost the same overhead. +struct NaiveAllocator { + void* Alloc(size_t size) { return malloc(size); } + + void Free(void* p) { + PADDLE_ENFORCE(p); + free(p); + } + + static NaiveAllocator* Instance() { + static NaiveAllocator x; + return &x; + } + + private: + std::mutex lock_; +}; + template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index a3b4e38f453835828a4a53130e11c854ac3f4a74..10c9eb80d0a7e07d5974ca10d740e71e7717b5c5 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -56,13 +56,13 @@ struct Style { }; template -static void PrettyLogEndl(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLogEndl(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; } template -static void PrettyLog(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLog(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset(); } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 9cdcb87df5dd1669066c204c86c269973df506f1..1c5ded943b3814688af1f177503d3bdc35073c3f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -87,6 +87,7 @@ if (WITH_TESTING) endif() endif() add_subdirectory(paddle/fluid/tests) + add_subdirectory(paddle/fluid/contrib/tests) endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index c82bc0b940a32bf584e87646442c2507864c2285..3bf2fe5db0cb2126295ebfda822eeac8762dbdb7 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -20,8 +20,11 @@ from . import memory_usage_calc from .memory_usage_calc import * from . import op_frequence from .op_frequence import * +from . import quantize +from .quantize import * __all__ = [] __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ +__all__ += quantize.__all__ diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14c208d0e7f35ebfbbe1c36d0b11a8d0f0efb4a6 --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import quantize_transpiler +from .quantize_transpiler import * + +__all__ = quantize_transpiler.__all__ diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..032d0353ea6d80c4356ea9a9886ea59c48feec7a --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -0,0 +1,557 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np + +from paddle.fluid.framework import default_main_program, default_startup_program, program_guard +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid import unique_name +from paddle.fluid import core +from paddle.fluid.initializer import Constant +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.nn import autoincreased_step_counter +from paddle.fluid.framework import Variable +from paddle.fluid.executor import global_scope +from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler + +__all__ = ['QuantizeTranspiler'] + +_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul'] + + +def _quantized_var_name(var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.quantized" % (var_name) + + +def _dequantized_var_name(var_name): + """ + Return dequantized variable name for the input `var_name`. + """ + return "%s.dequantized" % (var_name) + + +def _quantized_scale_name(var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.scale" % (var_name) + + +def _original_var_name(var_name): + """ + Return the original variable name. + """ + if var_name.endswith('.quantized.dequantized'): + return var_name[:-len('.quantized.dequantized')] + if var_name.endswith('.quantized'): + return var_name[:-len('.quantized')] + if var_name.endswith('.dequantized'): + return var_name[:-len('.dequantized')] + if var_name.endswith('.scale'): + return var_name[:-len('.scale')] + else: + return var_name + + +def _is_float(v): + return isinstance(v, float) or isinstance(v, np.float32) + + +def quant(x, scale, num_bits): + y = np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + return y + + +class QuantizeTranspiler(object): + def __init__(self, + weight_bits=8, + activation_bits=8, + activation_quantize_type='abs_max', + weight_quantize_type='abs_max', + window_size=10000): + """ + Convert and rewrite the fluid Program according to weight and + activation quantization type. + + Args: + weight_bits (int): quantization bit number for weights, + the bias is not quantized. + activation_bits (int): quantization bit number for activation. + activation_quantize_type (str): quantization type for activation, + now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode, + the quantization scale will be calculated dynamically each step + in both training and testing period. If use 'range_abs_max', + a static quantization scale will be calculated during training + and used in inference. + weight_quantize_type (str): quantization type for weights, + support 'abs_max'. The 'range_abs_max' usually is not used for + weight, since weights are fixed once the model is well trained. + window_size (int): the window size for 'range_abs_max' quantization. + + Examples: + + .. code-block:: python + + # the original program will be rewrite, if you don't want to + # change it, please clone at first. + # quantize_program = program.clone() + t = fluid.QuantizeTranspiler() + t.transpile(quantize_program) + + """ + self.weight_bits = weight_bits + self.activation_bits = activation_bits + quant_type = ['abs_max', 'range_abs_max'] + if weight_quantize_type not in quant_type: + raise ValueError( + "Unknown weight_quantize_type: '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + if activation_quantize_type not in quant_type: + raise ValueError( + "Unknown activation_quantize_type : '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + + self.weight_quantize_type = weight_quantize_type + self.activation_quantize_type = activation_quantize_type + + self.window_size = window_size + self.helper = LayerHelper(self.__class__.__name__) + self.fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self.fake_dequant_op_types = ['fake_dequantize_max_abs'] + self.is_test = None + self.global_step = None + + def training_transpile(self, program=None, startup_program=None): + """Rewrites a training input program in place for simulated + quantization. Insert fake quantization and de-quantization ops into + program to simulate the error introduced by quantization. And change + the graident ops' input by using the faked quantization weights and + activation. Since the program is transformed in place, the graph + connection will change. + + Args: + program (Program): the input program to be transpile. + """ + self.is_test = False + program = default_main_program() if program is None else program + startup_program = default_startup_program() if startup_program is \ + None else startup_program + + # marked the variable which has been quantized and dequantized. + dequanted_vars = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES] + + params = [p.name for p in program.global_block().iter_parameters()] + + def _transpile_forward(block, op): + idx = block.ops.index(op) + block_id = block.idx + # insert quant op and dequant op + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + else: + var = block.var(name) + quant_bits = self.weight_bits if var.name in params \ + else self.activation_bits + quant_type = self.weight_quantize_type if var.name \ + in params else self.activation_quantize_type + + quant_var, scale_var = self._insert_quant_op( + block, idx, var, quant_bits, quant_type) + dequant_var = self._insert_dequant_op( + block, idx + 1, quant_var, scale_var, quant_bits) + dequanted_vars[block_id][name] = dequant_var + # rename the forward op inputs + op._rename_input(name, dequant_var.name) + + def _transpile_backward(block, op): + block_id = block.idx + no_dequanted_input_vars = True + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + op._rename_input(name, dequant_var.name) + no_dequanted_input_vars = False + if no_dequanted_input_vars: + raise ValueError("There is no dequanted inputs for op %s." % + (op.type)) + + with program_guard(program, startup_program): + self._create_global_step() + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + # rewrite the forward ProgramDes + if op.type in _QUANTIZABLE_OP_TYPES: + _transpile_forward(block, op) + # rename the backward op inputs + if op.type in grad_op_types: + _transpile_backward(block, op) + + def _create_global_step(self): + if self.weight_quantize_type == 'range_abs_max' or \ + self.activation_quantize_type == 'range_abs_max': + self.global_step = autoincreased_step_counter() + + def freeze_program(self, program, place, fuse_bn=False, scope=None): + """Freeze input training program for inference. + + Args: + program (Program): the input program to be transpile. + """ + + self.is_test = True + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + if fuse_bn: + bn_fuse_transpiler = BNFuseTranspiler() + bn_fuse_transpiler.transpile(program, place) + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + op_in_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + op_out_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + var_scale_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + + def _remove_fake_quant_and_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + k = op.output('Out')[0] + v = op.input('X')[0] + if v not in op_in_rename_map[block_id]: + op_in_rename_map[block_id][k] = v + else: + op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v] + block._remove_op(idx) + + def _insert_post_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + max_range = None + scale_var = None + for name in op.input_arg_names: + if name in op_in_rename_map[block_id]: + op._rename_input(name, op_in_rename_map[block_id][name]) + + scale_v = var_scale_map[block_id][_original_var_name(name)] + if _original_var_name(name) in persistable_vars: + param_range = (1 << (self.weight_bits - 1)) - 1 + act_range = (1 << (self.activation_bits - 1)) - 1 + assert _is_float(scale_v) + max_range = param_range * act_range / scale_v + else: + assert isinstance(scale_v, Variable) + scale_var = var_scale_map[block_id][_original_var_name( + name)] + + if len(op.output_arg_names) != 1: + raise ValueError("Only support one output, but op %s has" + " more than one output." % (op.type)) + out_var = block.var(op.output_arg_names[0]) + dequant_var = block.create_var( + name=_dequantized_var_name(out_var.name), + type=out_var.type, + shape=out_var.shape, + dtype=out_var.dtype) + # insert fake_dequantize_op + dequant_op = block._insert_op( + idx + 1, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": out_var, + 'Scale': scale_var}, + outputs={"Out": dequant_var}) + op_out_rename_map[block_id][out_var.name] = dequant_var.name + return dequant_var + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + def _restore_var(name, arr): + t = scope.find_var(name).get_tensor() + t.set(arr, place) + + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + op_type = op.type + + # insert dequant_op after fc/conv, need to rename + # input of the followed ops + for name in op.input_arg_names: + if name in op_out_rename_map[block_id]: + op._rename_input(name, + op_out_rename_map[block_id][name]) + + if op_type in self.fake_quant_op_types: + in_arg_name = op.input('X')[0] + if in_arg_name in persistable_vars: + if self.weight_quantize_type == 'abs_max': + param = _load_var(in_arg_name) + scale_v = np.max(np.abs(param)) + else: + scale_v = _load_var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + else: + scale_v = block.var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + + if in_arg_name in persistable_vars: + _remove_fake_quant_and_dequant_op(block, op) + # quantize weight and restore + param_t = _load_var(in_arg_name) + param_q_t = quant(param_t, scale_v, self.weight_bits) + _restore_var(in_arg_name, param_q_t) + + if op_type in self.fake_dequant_op_types: + _remove_fake_quant_and_dequant_op(block, op) + + if op_type in _QUANTIZABLE_OP_TYPES: + dequant_var = _insert_post_dequant_op(block, op) + + # remove the unused var in ProgramDesc + self._remove_unused_var(program) + #program = program.clone() + + def convert_to_int8(self, program, place, scope=None): + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + global_block = program.global_block() + + def convert_to_int8(var): + int8_var_name = var.name + ".int8" + int8_var = global_block.create_parameter( + name=int8_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.INT8, + shape=var.shape) + + tensor = _load_var(var.name) + + scope.var(int8_var_name) + int8_tensor = scope.find_var(int8_var_name).get_tensor() + int8_tensor.set(tensor.astype(np.int8), place) + return int8_var + + input_map = {} + for block in program.blocks: + for op in list(block.ops): + if op.type in _QUANTIZABLE_OP_TYPES: + for name in op.input_arg_names: + var = block.var(name) + if var.persistable: + if name not in input_map: + int8_var = convert_to_int8(var) + input_map[name] = int8_var.name + op._rename_input(name, input_map[name]) + self._remove_unused_var(program) + + def _remove_unused_var(self, program): + all_remove_vars = [] + for block in program.blocks: + args = [] + for op in block.ops: + args += op.input_arg_names + args += op.output_arg_names + args = list(set(args)) + var_names = block.vars.keys() + sub_block_remove_vars = [] + for var in var_names: + if var not in args: + sub_block_remove_vars.append(var) + all_remove_vars.append(sub_block_remove_vars) + + remove_vars = [list(set(v)) for v in all_remove_vars] + for i, block in enumerate(program.blocks): + for v in remove_vars[i]: + block._remove_var(v) + + def _insert_quant_abs_max_op(self, block, idx, var, quant_bits): + """Insert fake_quantize_abs_max op. + """ + quant_var = block.create_var( + name=_quantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + scale = block.create_var( + name=_quantized_scale_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + quant_op = block._insert_op( + idx, + type='fake_quantize_abs_max', + attrs={'bit_length': quant_bits}, + inputs={'X': var}, + outputs={'Out': quant_var, + 'OutScale': scale}) + return quant_var, scale + + def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits): + """Insert fake_quantize_range_abs_max + """ + quant_var = block.create_var( + name=_quantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + scale = self.helper.create_parameter( + attr=ParamAttr( + name=_quantized_scale_name(var.name), + initializer=Constant(0.001), + trainable=False), + shape=[1], + dtype=var.dtype) + scale.stop_gradient = True + + ins = {'X': var, 'InScale': scale} + outs = {'Out': quant_var, 'OutScale': scale} + if not self.is_test: + # A global step counter variable with type int64 + scales = self.helper.create_global_variable( + name=unique_name.generate('scales'), + persistable=True, + dtype=var.dtype, + shape=[self.window_size]) + self.helper.set_variable_initializer( + scales, initializer=Constant(value=0)) + + ins['Iter'] = self.global_step + outs['OutScales'] = scales + + attrs = { + 'window_size': self.window_size, + 'bit_length': quant_bits, + 'is_test': self.is_test + } + + quant_op = block._insert_op( + idx, + type='fake_quantize_range_abs_max', + attrs=attrs, + inputs=ins, + outputs=outs) + + return quant_var, scale + + def _insert_quant_op(self, block, idx, var, quant_bits, quant_type): + """ + Insert fake_quantize_op + """ + if quant_type == 'abs_max': + return self._insert_quant_abs_max_op(block, idx, var, quant_bits) + elif quant_type == 'range_abs_max': + return self._insert_quant_range_abs_max_op(block, idx, var, + quant_bits) + + def _insert_dequant_op(self, block, idx, var, scale, quant_bits): + """ + Insert fake_quantize_op + """ + dequant_var = block.create_var( + name=_dequantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + # insert fake_dequantize_op + max_range = (1 << (quant_bits - 1)) - 1 + dequant_op = block._insert_op( + idx, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": var, + 'Scale': scale}, + outputs={"Out": dequant_var}) + return dequant_var + + +class BNFuseTranspiler(InferenceTranspiler): + def _fuse_param(self, current_op, bn_op, bias_op, with_bias): + def _update_param(op, param_name, new_param): + var = self.block.vars[param_name] + tensor = self.scope.find_var(param_name).get_tensor() + tensor.set(np.array(new_param), self.place) + + def _load_param(param_name): + return np.array(self.scope.find_var(param_name).get_tensor()) + + bias_bn = _load_param(bn_op.input("Bias")[0]) #Bias + scale_bn = _load_param(bn_op.input("Scale")[0]) #Scale + mean_bn = _load_param(bn_op.input("Mean")[0]) #Mean + var_bn = _load_param(bn_op.input("Variance")[0]) #Variance + + if current_op.type in ['conv2d', 'depthwise_conv2d']: + current_param = _load_param( + _original_var_name(current_op.input("Filter")[0])) + elif current_op.type == 'mul': + current_param = _load_param( + _original_var_name(current_op.input("Y")[0])) + + std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5))) + tmp = np.float32(np.divide(scale_bn, std_bn)) + + # add bias of batch_norm_op to conv2d + if with_bias: + bias = _load_param(bias_op.input("Y")) + else: + bias = np.zeros(bias_bn.shape) + bias = np.float32( + np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn)) + + # re-compute weight of conv2d/fc + tmp = tmp.reshape(tmp.shape[0], -1) + dst_param = current_param.reshape((tmp.shape[0], -1)) + dst_param = np.float32(np.multiply(dst_param, tmp)) + dst_param = dst_param.reshape(current_param.shape) + + # update parameters + if current_op.type in ['conv2d', 'depthwise_conv2d']: + _update_param(current_op, + _original_var_name(current_op.input("Filter")[0]), + dst_param) + elif current_op.type == 'mul': + _update_param(current_op, + _original_var_name(current_op.input("Y")[0]), + dst_param) + + _update_param(bias_op, bias_op.input("Y")[0], bias) + + # collect the renamed input + self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..9af3a6c9fda121d411a8a19f3928238be84fe8a6 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -0,0 +1,272 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import numpy as np +import six + +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name +from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + return avg_loss + + +class TestQuantizeTranspiler(unittest.TestCase): + def setUp(self): + # since quant_op and dequant_op is not ready, use cos and sin for test + self.weight_quant_op_type = 'fake_quantize_abs_max' + self.dequant_op_type = 'fake_dequantize_max_abs' + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_op_grad_and_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, program): + quantized_ops = {} + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + + for block in program.blocks: + for idx, op in enumerate(block.ops): + # check forward + if op.type in self.quantizable_op_and_inputs: + for i, arg_name in enumerate(op.input_arg_names): + quant_op_type = self.weight_quant_op_type if \ + _original_var_name(arg_name) \ + in persistable_vars else self.act_quant_op_type + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + if arg_name not in quantized_ops: + self.assertEqual(block.ops[idx - 2 * i - 1].type, + self.dequant_op_type) + self.assertEqual(block.ops[idx - 2 * i - 2].type, + quant_op_type) + quantized_ops[arg_name] = block.ops[idx - 2 * i - 2] + else: + op_idx = block.ops.index(quantized_ops[arg_name]) + self.assertLess(op_idx, idx) + + # check backward + if op.type in self.quantizable_op_grad_and_inputs: + for pname in self.quantizable_op_grad_and_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + def freeze_program(self, use_cuda): + def build_program(main, startup, is_test): + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + loss = conv_net(img, label) + if not is_test: + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + return [img, label], loss + + main = fluid.Program() + startup = fluid.Program() + test_program = fluid.Program() + + feeds, loss = build_program(main, startup, False) + build_program(test_program, startup, True) + test_program = test_program.clone(for_test=True) + + quant_transpiler = QuantizeTranspiler() + quant_transpiler.training_transpile(main) + quant_transpiler.training_transpile(test_program) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + iter = 5 + batch_size = 8 + class_num = 10 + exe.run(startup) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + + with fluid.program_guard(main): + for _ in range(iter): + data = next(train_reader()) + loss_v = exe.run(program=main, + feed=feeder.feed(data), + fetch_list=[loss]) + + with fluid.program_guard(test_program): + test_data = next(test_reader()) + w_var = fluid.framework._get_var('conv2d_1.w_0.quantized', + test_program) + # Testing during training + test_loss1, w_quant = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss, w_var]) + + # Freeze program for inference, but the weight of fc/conv is still float type. + quant_transpiler.freeze_program(test_program, place) + test_loss2, = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss]) + self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3) + w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0') + .get_tensor()) + self.assertEqual(np.sum(w_freeze), np.sum(w_quant)) + + # Convert parameter to 8-bit. + quant_transpiler.convert_to_int8(test_program, place) + # Save the 8-bit parameter and model file. + fluid.io.save_inference_model('model_8bit', ['image', 'label'], + [loss], exe, test_program) + # Test whether the 8-bit parameter and model file can be loaded successfully. + [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit', + exe) + # Check the loaded 8-bit weight. + w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8') + .get_tensor()) + + self.assertEqual(w_8bit.dtype, np.int8) + self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) + + def test_freeze_program_cuda(self): + if fluid.core.is_compiled_with_cuda(): + with fluid.unique_name.guard(): + self.freeze_program(True) + + def test_freeze_program_cpu(self): + with fluid.unique_name.guard(): + self.freeze_program(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d02c890209e65bdceb5da23ba5b9c7c0356174b8..723f9eb9c978755b77724100c266be199e0f301a 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test - if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 856980e546eb55f4cd83f7f3862c714e0e996207..0b9af6d7f6d5eb2ba81c04a51169127bbdba1b1a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -50,9 +50,7 @@ class TestDistRunnerBase(object): def run_pserver(self, args): self.get_model(batch_size=2) - - if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) @@ -70,7 +68,7 @@ class TestDistRunnerBase(object): self.get_model(batch_size=2) if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) if args.is_dist: t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index d2d927aca8428acd88a6a73c05d70e93439f861c..c0989ca709e100d8f147a08970b0e858c81ce09b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase): self.check_with_place("dist_se_resnext.py", delta=100) -# TODO(typhoonzero): fix this test -# class TestDistseResnXt2x2WithMemopt(TestDistBase): -# def _setup_config(self): -# self._sync_mode = True -# self._mem_opt = True - -# def test_dist_train(self): -# self.check_with_place("dist_se_resnext.py", delta=1e-7) +class TestDistseResnXt2x2WithMemopt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True + + def test_dist_train(self): + self.check_with_place("dist_se_resnext.py", delta=100) class TestDistSeResneXt2x2Async(TestDistBase): diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 28c7ae5341b20f0f79da8cf682d279fc4cc3fa19..c9a8176a72fb744963ae466e965a25bdfb0a44de 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory from .ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "memory_optimize", "release_memory", "HashName", - "RoundRobin", "DistributeTranspilerConfig" + "DistributeTranspiler", + "memory_optimize", + "release_memory", + "HashName", + "RoundRobin", + "DistributeTranspilerConfig", ] diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index d5aa54d752305b188d292f95f05cd70d27702c35..861bb5fae5d7a8561ded1f547fbb86ae1e1a073e 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,10 +14,10 @@ from __future__ import print_function -from collections import defaultdict, OrderedDict, Callable +from collections import defaultdict, MutableSet from .. import core from ... import compat as cpt -from ..framework import Program, default_main_program, Parameter, Variable +from ..framework import Program, default_main_program, Parameter, Variable, core from ..backward import _rename_arg_ from functools import reduce from six.moves import range @@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), PRINT_LOG = False +class OrderedSet(MutableSet): + def __init__(self, iterable=None): + self.end = end = [] + end += [None, end, end] # sentinel node for doubly linked list + self.map = {} # key --> [key, prev, next] + if iterable is not None: + self |= iterable + + def __len__(self): + return len(self.map) + + def __contains__(self, key): + return key in self.map + + def add(self, key): + if key not in self.map: + end = self.end + curr = end[1] + curr[2] = end[1] = self.map[key] = [key, curr, end] + + def update(self, other): + for e in other: + self.add(e) + + def discard(self, key): + if key in self.map: + key, prev, next = self.map.pop(key) + prev[2] = next + next[1] = prev + + def remove(self, key): + self.discard(key) + + def __iter__(self): + end = self.end + curr = end[2] + while curr is not end: + yield curr[0] + curr = curr[2] + + def __reversed__(self): + end = self.end + curr = end[1] + while curr is not end: + yield curr[0] + curr = curr[1] + + def pop(self, last=True): + if not self: + raise KeyError('set is empty') + key = self.end[1][0] if last else self.end[2][0] + self.discard(key) + return key + + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__, ) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, OrderedSet): + return len(self) == len(other) and list(self) == list(other) + return set(self) == set(other) + + class ControlFlowGraph(object): def __init__(self, program, ops, forward_num, skip_opt): self._program = program self._ops = ops self._forward_num = forward_num - self._successors = defaultdict(set) - self._presuccessors = defaultdict(set) - self._uses = defaultdict(set) - self._defs = defaultdict(set) - self._live_in = defaultdict(set) - self._live_out = defaultdict(set) + self._successors = defaultdict(OrderedSet) + self._presuccessors = defaultdict(OrderedSet) + self._uses = defaultdict(OrderedSet) + self._defs = defaultdict(OrderedSet) + self._live_in = defaultdict(OrderedSet) + self._live_out = defaultdict(OrderedSet) self._skip_opt = skip_opt self.pool = [] @@ -116,7 +181,7 @@ class ControlFlowGraph(object): # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in sorted(list(in_diff)) + x for x in in_diff if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: @@ -224,7 +289,7 @@ class ControlFlowGraph(object): if self.pool: # NOTE: must sort the in_diff set for cases that get different cache var. defs_can_optimize = [ - x for x in sorted(list(self._defs[i])) + x for x in self._defs[i] if self._check_var_validity(block_desc, x, is_forward) ] out_pair = [ @@ -381,7 +446,19 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): +def _is_opt_role_op(op): + op_maker = core.op_proto_and_checker_maker + optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): + return True + + +def memory_optimize(input_program, + skip_opt_set=None, + print_log=False, + level=0, + skip_grads=False): """Optimize memory by reusing var memory. Note: it doesn't not support subblock nested in subblock. @@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): raise ValueError("only support opt_level 0 or 1.") global PRINT_LOG PRINT_LOG = print_log + if skip_grads: + grad_set = set() + OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName() + for op in input_program.global_block().ops: + if _is_opt_role_op(op): + if op.attr(OP_ROLE_VAR): + grad_name = op.attr(OP_ROLE_VAR)[1] + grad_set.add(grad_name) + if not skip_opt_set: + skip_opt_set = grad_set + else: + skip_opt_set.update(grad_set) + cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) diff --git a/python/setup.py.in b/python/setup.py.in index 786c9f2e39880b68700b8acb94b3d35a48323958..b376be0ea373f089ef17f27435d979712fbdff72 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -106,6 +106,7 @@ packages=['paddle', 'paddle.fluid.layers', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', + 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']