diff --git a/Dockerfile b/Dockerfile index fe0721e9b99b5e028df2f6228ff04cb56a567a3f..c248ac119caa1f493e4866b02551eb900d3bf391 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ + +RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ + tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index afbff1e13cf7dc4a8f9f4bd9252e2194e0862c93..a4e683da0bc0ee6ab3bf920c07b512596bf7e9b6 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae')) paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8')) paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4')) -paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, 
defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99')) +paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3')) paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb')) paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535')) paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816')) @@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77')) paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713')) paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a')) -paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) +paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77')) paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823')) 
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2')) @@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) -paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) +paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d')) paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) @@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, 
defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) +paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) +paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f')) paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd')) paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47')) paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51')) diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index 0d7cbf298118722b8f32ccc5a8016ae5e168700b..c89a33fc959247afb74dab49056fc3fca8b9bd89 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -20,6 
+20,9 @@ #include #include #include +#include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/cpu_info.h" @@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const { bool NodeCanReused(ir::Node* node) { // valid the node is a var node - if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + if (node == nullptr || !node->IsVar() || node->IsCtrlVar() || + node->Name() == kEmptyVarName) + return false; bool flag = true; // op output force generated in cpu, can not be reused. @@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) { if (shape.empty() || size < MinChunkSize()) { return false; } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node.Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') - return false; return true; } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 5b8ae8b6770df79df309bb6be16e4f2a24ee0460..2afac32437dd79a54ef7d1ee2d203a34c1b5f30e 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include +#include #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { @@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) { auto &g = graphs.back(); g->Set(kGraphVars, new GraphVars(1UL)); g->Set(kGraphDepVars, new GraphDepVars); + auto &stale_ops = + graph->Get>(details::kStaleProgramOpDescs); + g->Erase(details::kStaleProgramOpDescs); + g->Set>(details::kStaleProgramOpDescs, + new std::vector(stale_ops)); } auto op_handles = ir::FilterByNodeWrapper(*graph); diff --git a/paddle/fluid/framework/small_stack.h b/paddle/fluid/framework/inlined_stack.h similarity index 97% rename from paddle/fluid/framework/small_stack.h rename to paddle/fluid/framework/inlined_stack.h index 6919ff7a28aac909511d0a8fd983df3eeaf3ca13..1083c9f77c5476dc20a8e0ccf5acd0f718436ef6 100644 --- a/paddle/fluid/framework/small_stack.h +++ b/paddle/fluid/framework/inlined_stack.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include "paddle/fluid/platform/enforce.h" @@ -22,7 +21,7 @@ namespace paddle { namespace framework { template -class SmallStack { +class InlinedStack { static_assert(N > 0, "N must be larger than 0"); public: @@ -66,8 +65,8 @@ class SmallStack { private: T head_[N]; + size_t size_{0}; std::deque tail_; - size_t size_; }; } // namespace framework diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index c53b2a6186741d86f14faf1d21fa19aa09cec036..3a1022bbcbd671391fb034bdff7c3cf97952f84d 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" @@ -24,6 +25,10 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = 
"__fuse_statis__"; +// When we use trt or other third_party lib, the parameters are managed by +// the lib, but not the fluid. So we need to record them to avoid duplicate +// allocation. +static const char kRepetitiveParamAttr[] = "__repetitive_param__"; enum FuseOptions { DO_NOT_FUSE, // fusing will not be done diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 22d4c0a91cc1638264a8c57aa2841ff4e65a1400..28a37f331c100695f0ffec7288db84f4493d68a0 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -130,15 +130,21 @@ std::map> BuildOperationAdjList( if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } + std::vector nodes; for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); - adj_list[n].insert(adj_n); + nodes.push_back(adj_n); } } + std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) { + return node1->id() > node2->id(); + }); + adj_list[n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); } return adj_list; } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5a874fe437d83e2ba795a0b063d7f1811afa04d8..df1689764d21fcbb054a0bf32ef725541bdaefe3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -const Variable* ExecutionContext::LegacyInputVar( - const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? 
nullptr : scope_.FindVar(ipt); -} - Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; @@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); -} - template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } -template <> -const Tensor* ExecutionContext::LegacyInput( - const std::string& name) const { - return LegacyInput(name); -} - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -521,35 +504,11 @@ const std::vector ExecutionContext::MultiInput( return res; } -template <> -const std::vector ExecutionContext::LegacyMultiInput( - const std::string& name) const { - auto names = op().Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> const Tensor* { - auto var = scope_.FindVar(sub_name); - if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "%s should be LoDTensor, but the received type is %s", - sub_name, ToTypeName(var->Type())); - return &(var->Get()); - }); - return res; -} - template <> Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } -template <> -Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { - return LegacyOutput(name); -} - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 24ab33a1442e261bf98de13c2dd4b1f9af630ec8..85119d38d133b68a9b2cd2512d25776f01faddac 100644 --- 
a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -16,9 +16,11 @@ limitations under the License. */ #include #include +#include #include #include #include +#include #include #include "glog/logging.h" // For VLOG @@ -253,31 +255,6 @@ class ExecutionContext { return it->second; } - const std::vector LegacyMultiInputVar( - const std::string& name) const { - auto names = op_.Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [this](const std::string& name) { - return name == kEmptyVarName ? nullptr - : scope_.FindVar(name); - }); - return res; - } - - std::vector LegacyMultiOutputVar(const std::string& name) const { - auto names = op_.Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [this](const std::string& name) { - return name == kEmptyVarName ? nullptr - : scope_.FindVar(name); - }); - return res; - } - template const T* Input(const std::string& name) const { auto* var = InputVar(name); @@ -290,22 +267,6 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } - template - const T* LegacyInput(const std::string& name) const { - auto* var = LegacyInputVar(name); - return var == nullptr ? nullptr : &var->Get(); - } - - template - T* LegacyOutput(const std::string& name) const { - auto var = LegacyOutputVar(name); - return var == nullptr ? 
nullptr : var->GetMutable(); - } - - const Variable* LegacyInputVar(const std::string& name) const; - - Variable* LegacyOutputVar(const std::string& name) const; - template const std::vector MultiInput(const std::string& name) const { auto it = ctx_.inputs.find(name); @@ -338,32 +299,6 @@ class ExecutionContext { return res; } - template - const std::vector LegacyMultiInput(const std::string& name) const { - auto names = op_.Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> const T* { - auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : &var->Get(); - }); - return res; - } - - template - std::vector LegacyMultiOutput(const std::string& name) const { - auto names = op_.Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> T* { - auto var = scope_.FindVar(sub_name); - return var == nullptr ? 
nullptr : var->GetMutable(); - }); - return res; - } - platform::Place GetPlace() const { return device_context_.GetPlace(); } template @@ -433,24 +368,13 @@ class ExecutionContext { template <> const Tensor* ExecutionContext::Input(const std::string& name) const; -template <> -const Tensor* ExecutionContext::LegacyInput( - const std::string& name) const; - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; -template <> -const std::vector ExecutionContext::LegacyMultiInput( - const std::string& name) const; - template <> Tensor* ExecutionContext::Output(const std::string& name) const; -template <> -Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2f31b182af7293488719e41a92b2ea78709bda02..89e934ae27b9319d4e1d2d51586d5f8fa7dccfce 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,8 +23,12 @@ #pragma once +#include #include +#include +#include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -133,6 +137,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + bool); // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 59107f28080dceb0a58e17d42281db5f3773de56..a48058400241b030f17557156a4d973fca92fd8d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include +#include #include #include #include #include +#include #include #include "paddle/fluid/framework/framework.pb.h" @@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, return ""; } +static std::string GetTrtEngineSerializedPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_serialized_" + engine_key; +} + +static std::string GetTrtEngineSerializedData( + const std::string &model_opt_cache_dir, const std::string &engine_key) { + std::string trt_serialized_path = + GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key); + if (FileExists(trt_serialized_path)) { + VLOG(3) << "Trt serialized file: " << trt_serialized_path + << "is found here"; + std::ifstream infile(trt_serialized_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string trt_engine_serialized_data(buffer.str()); + return trt_engine_serialized_data; + } + return ""; +} + +static void SaveTrtEngineSerializedDataToFile( + const std::string &trt_serialized_path, + const std::string &engine_serialized_data) { + std::ofstream outfile(trt_serialized_path); + outfile << engine_serialized_data; + outfile.close(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8d5ee36ae627deccd7ddbd4bf8c5354a82c5e9db..1cdb4881fbc1e2c0249430f7148bf56261bd6c41 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ 
b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set( "model_opt_cache_dir", new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("use_static_engine", + new bool(argument->tensorrt_use_static_engine())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 2a595cb36b8345157b3fd26afc62aabfa98b87bc..2d120679eedd392d78b4da66276297ff7280792b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -22,7 +22,10 @@ #pragma once +#include #include +#include +#include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 69a9caec030600332c9f11ba255e4e642bd41e96..d4e2da8957f2057b21460d00b71e9717c63ed054 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -14,13 +14,13 @@ #include #include -#include -#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/string/pretty_log.h" @@ -33,8 +33,15 @@ using framework::ir::Node; std::vector ExtractParameters( const std::unordered_set &nodes); -std::unique_ptr 
analysis::TensorRtSubgraphPass::ApplyImpl( +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map); +std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -47,9 +54,16 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( Get("min_subgraph_size") /*min subgraph size*/); fuser(); + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in trt, and should not have another copy in + // fluid. + std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get()); + CreateTensorRTOp(node, graph.get(), graph_param_names, + &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); @@ -64,12 +78,15 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, - Graph *graph) const { +void 
TensorRtSubgraphPass::CreateTensorRTOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; + + // The node->inputs containes input tensors and parameters. for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } - op_desc->SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); std::set output_names; std::set output_names_with_id; @@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_names_with_id.insert(x->Name() + std::to_string(x->id())); } - op_desc->SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetType("tensorrt_engine"); - std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. @@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // input of a OP, but also the output of a Op, there will be problems. // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. 
- - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + 
&output_names_with_id, &output_names, &output_name_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } + PADDLE_ENFORCE(!output_mapping.empty()); auto *vars = block_desc.Proto()->mutable_vars(); for (framework::ir::Node *node : graph->Nodes()) { @@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); - PADDLE_ENFORCE(!output_mapping.empty()); + + // Set attrs + op_desc->SetType("tensorrt_engine"); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); - // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + // Get "" when there is no cached calibration table data. 
std::string calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } + + bool use_static_engine = Get("use_static_engine"); + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (!calibration_mode && use_static_engine) { + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + + if (trt_engine_serialized_data.empty()) { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), + enable_int8, calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); + } else { + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + } + + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + } } std::vector ExtractParameters( @@ -253,7 +279,7 @@ std::vector ExtractParameters( for (const auto &node : nodes) { if (!node->IsOp()) continue; std::string op_type = node->Op()->Type(); - if (op_type == "feed") { + if (op_type == "feed" || op_type == "fetch") { std::vector output_names = node->Op()->OutputArgumentNames(); std::copy(output_names.begin(), output_names.end(), std::back_inserter(feed_outputs)); @@ -272,6 +298,99 @@ std::vector ExtractParameters( return parameters; } +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map) { + //// In the normal case, the paddle-trt exists bug when runing the googlenet. 
+ // When there are more than two convolutions of 1 * 1 with the same input, the + // paddle-tensorrt will do the merging optimization, which fuses those convs + // into one conv, and then triggers a bug. So, we should use a strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d") { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); 
+ const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 502353b95fc15e763900a0caf1649257508f0880..6689a668fc9313df4105875477424f1426637226 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -13,7 +13,12 @@ // limitations under the License. 
#pragma once -#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateTensorRTOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 8be2d3ac0b105e50fe619a720929dedaacb75537..d13ec7608c3e8075c1ef62fd4d47fbeee06e9005 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + LOG(INFO) << "Sync params from CPU to GPU"; PADDLE_ENFORCE(argument->gpu_device_id_valid()); @@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Because there exists the case that new parameter variables are not added to // the program in the analysis pass. 
for (auto &var_name : all_vars) { + if (std::count(repetitive_params.begin(), repetitive_params.end(), + var_name)) { + continue; + } auto *var = scope->FindLocalVar(var_name); PADDLE_ENFORCE(var != nullptr); if (var->IsType() || diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index a95f460df6f9636fc17a5cf76920f5f459385120..61990150a30db147418c4301359428cf3c6db541 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 522ab495227e9b8c52b8d38db696fa9b785ba642..77411112220dcb722d4d3482bc844720981a2da2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); + CP_MEMBER(trt_use_static_engine_); // MKLDNN related. 
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() { void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode) { + AnalysisConfig::Precision precision_mode, bool use_static) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; + trt_use_static_engine_ = use_static; Update(); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e8964c4acea0d220deca048a018eb7de42d7e4e5..cb92bb8211b25f436c1c3a0da014f1dc40520fbb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) { bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { + if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + } VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); @@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } if (config_.use_mkldnn_) { @@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; + idx2feeds_[idx] = op->Output("Out")[0]; } else if (op->Type() == "fetch") { int 
idx = boost::get(op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); } fetches_[idx] = op; + idx2fetches_[idx] = op->Input("X")[0]; } } } @@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { var->GetMutable(); } +std::vector AnalysisPredictor::GetInputNames() { + std::vector input_names; + for (auto &item : idx2feeds_) { + input_names.push_back(item.second); + } + return input_names; +} + +std::vector AnalysisPredictor::GetOutputNames() { + std::vector output_names; + for (auto &item : idx2fetches_) { + output_names.push_back(item.second); + } + return output_names; +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -460,6 +482,13 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; } @@ -470,6 +499,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } return res; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index d5445c58e45ae64a8cfab03cb610e3677729338b..5c0535d63e00c32ef82aa6d804459542d7da3e50 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -15,12 +15,14 @@ #pragma once #include #include +#include #include #include #include 
"paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING @@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + std::vector GetInputNames(); + std::vector GetOutputNames(); + std::unique_ptr GetInputTensor( const std::string &name) override; std::unique_ptr GetOutputTensor( @@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; + // Sorted according to the idx. + std::map idx2feeds_; std::vector fetches_; + std::map idx2fetches_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. 
std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 97c164bdef7a4b3e66be78526793f3830ada398b..048286a843f0190a8139cb86eda4f3a3a40d89a1 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() { bool NativePaddlePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { + if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + } VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f60ff40c5da3e9e03c2cb3583263394cb82db805..cf02901d963858d2a44b7c588a5c6a49358b0d3f 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +template +void ZeroCopyTensor::copy_from_cpu(const T *data) { + EAGER_GET_TENSOR; + PADDLE_ENFORCE_GE( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" + "function before copy data from cpu."); + size_t ele_size = tensor->numel() * sizeof(T); + + if (place_ == PaddlePlace::kCPU) { + auto *t_data = tensor->mutable_data(platform::CPUPlace()); + std::memcpy(static_cast(t_data), data, ele_size); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + platform::CUDAPlace gpu_place(device_); + auto *t_data 
= tensor->mutable_data(gpu_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + + memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), + data, ele_size, dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} + +template +void ZeroCopyTensor::copy_to_cpu(T *data) { + EAGER_GET_TENSOR; + auto ele_num = tensor->numel(); + auto *t_data = tensor->data(); + auto t_place = tensor->place(); + + if (platform::is_cpu_place(t_place)) { + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto gpu_place = boost::get(t_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, + t_data, ele_num * sizeof(T), dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} +template void ZeroCopyTensor::copy_from_cpu(const float *data); +template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(float *data); +template void ZeroCopyTensor::copy_to_cpu(int64_t *data); + template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data(PaddlePlace *place, @@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector ZeroCopyTensor::shape() const { +std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); - return framework::vectorize(tensor->dims()); + return framework::vectorize2int(tensor->dims()); } void ZeroCopyTensor::SetLoD(const std::vector> &x) { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 
12071e09f8442f2c52a06b7c3fe4bed2c28b524a..cbbb3ea2d1395acdf4c460bea4b7868c31a20e53 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() const { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2c612cbb39fcaa7c80b6051a67215fd..ec3bef42fd91cea04a656dd38a4e5c45c1a76476 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -50,6 +50,11 @@ class Timer { } }; +static int GetUniqueId() { + static int id = 0; + return id++; +} + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c1c6227cdd8b2042f6765c7932327ecae246c260..9b05c335047d7f9a0c50004e4ff6817ddd53d80f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -135,7 +135,8 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - Precision precision = Precision::kFloat32); + Precision precision = Precision::kFloat32, + bool use_static = true); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -233,6 +234,7 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; + bool trt_use_static_engine_; // memory reuse related. 
bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c9a45b4aa3b4037d3725622fc960848bc1ccfb2c..f807289f6aee06e3ff61bc0dd92f47c599421354 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -160,11 +160,21 @@ class ZeroCopyTensor { template T* data(PaddlePlace* place, int* size) const; - std::vector shape() const; + template + void copy_from_cpu(const T* data); + + template + void copy_to_cpu(T* data); + + std::vector shape() const; void SetLoD(const std::vector>& x); std::vector> lod() const; const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} @@ -179,6 +189,8 @@ class ZeroCopyTensor { // The corresponding tensor pointer inside Paddle workspace is cached for // performance. mutable void* tensor_{nullptr}; + PaddlePlace place_; + int device_; }; /** A simple Inference API for Paddle. @@ -200,6 +212,14 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + /** \brief Get input names of the model + */ + virtual std::vector GetInputNames() { return {}; } + + /** \brief Get output names of the model + */ + virtual std::vector GetOutputNames() { return {}; } + /** \brief Get a mutable tensor directly. * * NOTE Only works in AnalysisPredictor. diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index ce2b8161715a3fa2278ce950dbac82c6d0042bef..1a13ba510384c010e476bf0ba0ad5b0ba84d3240 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -49,11 +49,6 @@ class EngineBase { // Execute the engine, that will run the inference network. virtual void Execute(int batch_size) = 0; - // Return the IO buffer that allocated in engine. One can read/write directly - // on the buffer. 
If the buffer's buffer is nullptr, one can also allocate - // memory and maintain it outside the engine. - virtual Buffer& buffer(const std::string& name) = 0; - virtual ~EngineBase() {} }; // class EngineBase diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7900f56c9ce17ffc7c62c85a42c62ba326dea16e..39a99a21ea702032669ed4ed3016ab34128c9925 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -18,21 +18,6 @@ namespace paddle { namespace inference { namespace tensorrt { -bool to_skip_merging_optimize(TensorRTEngine* engine, - const std::vector& filters, - const std::vector& strides, - const std::vector& paddings, - std::string input_name) { - if (engine->itensor_quote_num[input_name] > 0) { - return true; - } - if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 && - strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0) - engine->itensor_quote_num[input_name] += 1; - - return false; -} - template void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode, @@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, weight_tensor->Resize(Y_t->dims()); TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - auto* weight_data = weight_tensor->mutable_data(platform::CPUPlace()); + auto* weight_data = weight_tensor->mutable_data(cpu_place); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); const int n_output = weight_tensor->dims()[0]; @@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); - if (test_mode || - to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings, - op_desc.Input("Input").front())) { + if (test_mode) { 
engine->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 79362f9677010247dffa4fbaa155a7a56eed6f85..0c5a1a6ef16f05308df22452ed5e184e94e117d2 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); @@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter { "ElementWisePluginLayer"; plugin::ElementWisePlugin* plugin = - new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); nvinfer1::IPluginLayer* layer = engine_->AddPlugin( diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index eef4fab4e86f05fa80bc614371f1aa43e433407e..42dcd68e40e04e775961fd943070f3df2f28d99a 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter { Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + static_cast(Y_t->numel())}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, static_cast(tmp->data()), - Y_t->memory_size() / sizeof(float)); + static_cast(Y_t->numel())); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; diff --git 
a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2..90ed90b1e2907cc4be6f507890bae8df5a44ee38 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -16,9 +16,12 @@ limitations under the License. */ #include #include +#include +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -26,6 +29,37 @@ namespace paddle { namespace inference { namespace tensorrt { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { // NOLINT + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace // NOLINT + /* * Convert Op from Fluid to TensorRT Engine. */ @@ -110,6 +144,34 @@ class OpConverter { } } + // The scope here should be inited with the parameter vars. 
+ void ConvertBlockToTRTEngine( + framework::BlockDesc* block_desc, const framework::Scope& scope, + const std::vector& inputs, + const std::unordered_set& parameters, + const std::vector& outputs, TensorRTEngine* engine) { + engine->InitNetwork(); + for (auto& input : inputs) { + if (parameters.count(input)) continue; + auto* var = block_desc->FindVar(input); + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto var_shape = var->GetShape(); + + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var_shape)); + } + framework::proto::BlockDesc* block_proto = block_desc->Proto(); + ConvertBlock(*block_proto, parameters, scope, engine); + for (auto& output : outputs) { + engine->DeclareOutput(output); + } + engine->FreezeNetwork(); + } + void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index dbdff85ddebc85bc51938a204a48affe485b8240..2ae804106e5f7b51fc43e33cad986619e6a57d74 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter { PADDLE_ENFORCE_NOT_NULL(alpha_var); auto* alpha_tensor = alpha_var->GetMutable(); - platform::CUDAPlace place; - std::unique_ptr alpha_tensor_device( + platform::CPUPlace cpu_place; + std::unique_ptr alpha_tensor_temp( new framework::LoDTensor()); - alpha_tensor_device->Resize(alpha_tensor->dims()); - TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get()); - float* alpha_data = alpha_tensor_device->mutable_data(place); + alpha_tensor_temp->Resize(alpha_tensor->dims()); + TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get()); + float* 
alpha_data = alpha_tensor_temp->mutable_data(cpu_place); - // Transform alpha to TensorRTEngine::Weight - TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, - static_cast(alpha_data), - alpha_tensor_device->numel()); - plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = + new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory engine_->weight_map[op_desc.Input("Alpha")[0]] = - std::move(alpha_tensor_device); + std::move(alpha_tensor_temp); std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index e83961f3d7bda03a7659f175c59105dcb60708e9..2571abbf69892dae626c7178609c2825775fdf2e 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -19,7 +19,9 @@ limitations under the License. 
*/ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -79,7 +81,8 @@ class TRTConvertValidation { if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_.reset( + new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); engine_->InitNetwork(); } @@ -114,13 +117,12 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + platform::CUDADeviceContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place, ctx); + RandomizeTensor(x_tensor, place_, ctx); } // Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const nvinfer1::Dims& dims, @@ -146,19 +148,6 @@ class TRTConvertValidation { // Declare outputs. op_desc_.reset(new framework::OpDesc(desc, nullptr)); - - // Set Inputs. - for (const auto& input : op_desc_->InputArgumentNames()) { - if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); - PADDLE_ENFORCE(var); - auto tensor = var->GetMutable(); - - engine_->SetInputFromGPU( - input, static_cast(tensor->data()), - sizeof(float) * - analysis::AccuDims(tensor->dims(), tensor->dims().size())); - } } // We use the set 'neglected_output' here, because some Ops like batch norm, @@ -168,43 +157,71 @@ class TRTConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op PADDLE_ENFORCE_LE(batch_size, max_batch_size_); - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - op_->Run(scope_, place); - // Execute TRT. 
- engine_->Execute(batch_size); - cudaStreamSynchronize(engine_->stream()); + platform::CUDADeviceContext ctx(place_); + op_->Run(scope_, place_); - ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 3000; + std::vector input_output_names; + + // Note: we need filter the parameter + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + input_output_names.push_back(input); + } + + // Collect the fluid outputs. + std::vector> fluid_outs; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; + input_output_names.push_back(output); std::vector fluid_out; - std::vector trt_out(output_space_size); - engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(engine_->stream()); - auto* var = scope_.FindVar(output); - auto tensor = var->GetMutable(); + auto* tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + fluid_outs.push_back(fluid_out); + } + + // Bind input and output for TRT. + const int num_bindings = input_output_names.size(); + std::vector buffers(num_bindings); + + for (const std::string& name : input_output_names) { + auto* var = scope_.FindVar(name); + auto* tensor = var->GetMutable(); + const int bind_index = engine_->engine()->getBindingIndex(name.c_str()); + buffers[bind_index] = + static_cast(tensor->mutable_data(place_)); + } + + // Execute TRT. 
+ engine_->Execute(batch_size, &buffers, stream_); - size_t fluid_out_size = fluid_out.size(); + ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + int index = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector trt_out; + auto* var = scope_.FindVar(output); + auto* tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &trt_out); + + size_t fluid_out_size = fluid_outs[index].size(); if (if_add_batch_ == true) { fluid_out_size = batch_size * (framework::product(tensor->dims()) / max_batch_size_); } - // Compare two output - ASSERT_FALSE(fluid_out.empty()); + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); + EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5); } + index += 1; } } framework::Scope& scope() { return scope_; } private: + platform::CUDAPlace place_; std::unique_ptr engine_; cudaStream_t stream_; std::unique_ptr op_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 10f48462cfaf8073a4f5537d654d614d36b74db4..fddf5f11c285da4687b08d1962b6f1f51390e03e 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } -void TensorRTEngine::Execute(int batch_size) { +void TensorRTEngine::Execute(int batch_size, std::vector *buffers, + cudaStream_t stream) { freshDeviceId(); batch_size_ = batch_size; - std::vector buffers; - for (auto &buf : buffers_) { - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); - PADDLE_ENFORCE_GT(buf.max_size, 0); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buffers.push_back(buf.buffer); - } - infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); - 
cudaStreamSynchronize(stream_); + infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr); + cudaStreamSynchronize(stream); SetRuntimeBatch(batch_size); } -TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(stream_); - // clean buffer - for (auto &buf : buffers_) { - if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { - PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); - buf.buffer = nullptr; - buf.max_size = 0; - } - } -} - void TensorRTEngine::FreezeNetwork() { - VLOG(3) << "TRT to freeze network"; freshDeviceId(); + VLOG(3) << "TRT to freeze network"; PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() { PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); infer_context_.reset(infer_engine_->createExecutionContext()); - - // allocate GPU buffers. - buffers_.resize(buffer_sizes_.size()); - for (auto &item : buffer_sizes_) { - // The output buffers are not set in the network building phrase, need to - // infer from the TesorRT network. - if (item.second == 0) { - auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); - auto dims = infer_engine_->getBindingDimensions(slot_offset); - item.second = kDataTypeSize[static_cast( - infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; - PADDLE_ENFORCE_GT(item.second, 0); - } - - auto &buf = buffer(item.first); - buf.max_size = item.second * max_batch_; - CHECK(buf.buffer == nullptr); // buffer should be allocated only once. 
- - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_)); - buf.size = 0; - PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G - buf.device = DeviceType::GPU; - } } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) { buffer_sizes_[name] = 0; } -void *TensorRTEngine::GetOutputInGPU(const std::string &name) { - return buffer(name).buffer; -} - -void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, stream_), - 0); -} - -void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, 
buf.buffer, dst_size, - cudaMemcpyDeviceToHost, stream_)); -} - -Buffer &TensorRTEngine::buffer(const std::string &name) { - PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s", - name); - auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); - return buffers_[slot_offset]; -} - -void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buf.size = size; - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, stream_)); -} - -void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - buf.size = size; - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, stream_)); -} - void TensorRTEngine::SetITensor(const std::string &name, nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); @@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } -void TensorRTEngine::freshDeviceId() { - int count; - cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_, count); - cudaSetDevice(device_); -} - nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, plugin::PluginTensorRT *plugin) { @@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } +void TensorRTEngine::freshDeviceId() 
{ + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_id_, count); + cudaSetDevice(device_id_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index cdfe09b5a7fd2d1f8548dab9421f671f5a345153..657dfd9355f9e3167a123b1f71655869d030a3df 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -37,7 +38,9 @@ class TRTInt8Calibrator; * There are two alternative ways to use it, one is to build from a paddle * protobuf model, another way is to manully construct the network. */ -class TensorRTEngine : public EngineBase { +class TensorRTEngine { + using DescType = ::paddle::framework::proto::BlockDesc; + public: // Weight is model parameter. class Weight { @@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, - int device = 0, bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, + TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream), - device_(device), enable_int8_(enable_int8), calibrator_(calibrator), + device_id_(device_id), logger_(logger) {} - virtual ~TensorRTEngine(); + ~TensorRTEngine() {} // TODO(Superjomn) implement it later when graph segmentation is supported. 
- void Build(const DescType& paddle_model) override; + void Build(const DescType& paddle_model); - void Execute(int batch_size) override; + void Execute(int batch_size, std::vector* buffers, + cudaStream_t stream); // Initialize the inference network, so that TensorRT layers can add to this // network. void InitNetwork() { + freshDeviceId(); infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } @@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase { // Check if the ITensor has been declared bool HasDeclared(const std::string& name); - // GPU memory address for an ITensor with specific name. One can operate on - // these memory directly for acceleration, for example, output the converted - // data directly to the buffer to save data copy overhead. - // NOTE this should be used after calling `FreezeNetwork`. - Buffer& buffer(const std::string& name) override; - - cudaStream_t stream() { return stream_; } - - // Fill an input from CPU memory with name and size. - void SetInputFromCPU(const std::string& name, const void* data, size_t size); - // TODO(Superjomn) is this method necessary given that buffer(xxx) can be - // accessed directly. Fill an input from GPU memory with name and size. - void SetInputFromGPU(const std::string& name, const void* data, size_t size); - // Get an output called name, the output of tensorrt is in GPU, so this method - // Return the output's GPU memory address without copy. - void* GetOutputInGPU(const std::string& name); - // Copy data into dst inside the GPU device. - void GetOutputInGPU(const std::string& name, void* dst, size_t max_size); - // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU - // to CPU. - void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); - // Fill an ITensor into map itensor_map_. void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. 
nvinfer1::ITensor* GetITensor(const std::string& name); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + + nvinfer1::IHostMemory* Serialize() { + PADDLE_ENFORCE(infer_engine_ != nullptr, + "You should build engine first and then serialize"); + ihost_memory_.reset(infer_engine_->serialize()); + return ihost_memory_.get(); + } + + void Deserialize(const std::string& engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + &inference::Singleton::Global())); + PADDLE_ENFORCE(infer_engine_ != nullptr, + "build cuda engine failed when deserialize engine info.!"); + infer_context_.reset(infer_engine_->createExecutionContext()); + } + void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); - int GetDevice() { return device_; } + int GetDeviceId() { return device_id_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); @@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase { std::unordered_map> weight_map; - // TODO(NHZLX) - // In the normal case, the paddle-trt exists bug when runing the googlenet. - // When there are more than two convolutions of 1 * 1 with the same input, the - // paddle-tensorrt will do the merging optimization, which fuse those conv - // into one conv, and then trigger bug. So, We should use strategy to avoid - // this - // optimization for the time being. This bug will be fixed in the future. - std::unordered_map - itensor_quote_num; - private: + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). 
+ void freshDeviceId(); + // the max batch size int max_batch_; // the runtime batch size @@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; - cudaStream_t stream_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; - bool enable_int8_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; + int device_id_; nvinfer1::ILogger& logger_; - std::vector buffers_; // max data size for the buffers. std::unordered_map buffer_sizes_; std::unordered_map @@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; - // Each ICudaEngine object is bound to a specific GPU when it is instantiated, - // ensure that the thread is associated with the correct device by calling - // freshDeviceId(). - void freshDeviceId(); + infer_ptr ihost_memory_; }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. 
// For example: -// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias) // // Reference // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index fc7ca7714e9325d2b6bce6189300aa339c81c2ba..010942a0678fe9a592d1a95ba9cdc6adc42cc2ec 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/enforce.h" @@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger { ~NaiveLogger() override {} }; +class NaiveProfiler : public nvinfer1::IProfiler { + public: + typedef std::pair Record; + std::vector mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) { + auto record = + std::find_if(mProfile.begin(), mProfile.end(), + [&](const Record& r) { return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + + void printLayerTimes() { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), + mProfile[i].second); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime); + } +}; + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 95443e813327c1247ac530c4d2e68b3607ff0e73..709aa103d1b6681221328b180d65e90f08d3368e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu 
prelu_op_plugin.cu + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc avg_pool_op_plugin.cu DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu index 5d747af8c55d71fee90ee0cc06fd328e583f3700..f27a838162c89b6377a7ffd995608b3a5a49eeae 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/pooling.h" namespace paddle { @@ -20,6 +21,12 @@ namespace inference { namespace tensorrt { namespace plugin { +AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer, + size_t length) { + return new AvgPoolPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize); + nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( int index, const nvinfer1::Dims* inputDims, int nbInputs) { assert(nbInputs == 1); diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h index b5e4ece0fba446627d619df6fe225e8c07231487..a7c0aa5794e6bb131d012cb12d6d9fc12a73bd0d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h @@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(ceil_mode_) + SerializedSize(ksize_) + - SerializedSize(strides_) + SerializedSize(paddings_) + - SerializedSize(input_shape_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) + + SerializedSize(ksize_) + 
SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_) + getBaseSerializationSize(); } // TRT will call this func when we need to serialize the configuration of // tensorrt. - // It should not be called by users. void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, ksize_); SerializeValue(&buffer, strides_); SerializeValue(&buffer, paddings_); SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); } public: + AvgPoolPlugin() {} AvgPoolPlugin(bool ceil_mode, std::vector ksize, std::vector strides, std::vector paddings, std::vector input_shape) @@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); } AvgPoolPlugin *clone() const override { @@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT { input_shape_); } - const char *getPluginType() const override { return "avg_pool"; } + const char *getPluginType() const override { return "avg_pool_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 9cd9026b7328083389b5af484bbb15c07b4908b0..9aed3ddab1448fde7cb6b0e13bcf0b05e23622e9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -14,12 +14,19 @@ limitations under the License. 
*/ #include #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer, + size_t length) { + return new ElementWisePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize); + namespace details { template @@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, const float* y = reinterpret_cast(inputs[1]); float* out = reinterpret_cast(outputs[0]); - if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + if (type_ == "add") { details::ElementWise(details::Add(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); - } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + } else if (type_ == "mul") { details::ElementWise(details::Mul(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 9c461f7a5c44ebb9d4a755288c69abff55e2dea8..3b040f14c531c540b8a855da85ecc3008224526c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,9 +25,8 @@ namespace plugin { class ElementWisePlugin : public PluginTensorRT { public: - ElementWisePlugin(nvinfer1::ElementWiseOperation type, - nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, - int axis) + ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x, + nvinfer1::Dims const &dims_y, int axis) : type_(type), dims_x_(dims_x), dims_y_(dims_y), @@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT { ElementWisePlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); + const char *elementwise_type; + DeserializeValue(&serial_data, &serial_length, &elementwise_type); + type_ = std::string(elementwise_type); DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &dims_x_); DeserializeValue(&serial_data, &serial_length, &dims_y_); @@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT { return nullptr; } - const char *getPluginType() const override { return "elementwise"; } + const char *getPluginType() const override { return "elementwise_plugin"; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(dims_x_) + - SerializedSize(dims_y_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(dims_x_) + SerializedSize(dims_y_) + + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); + SerializeValue(&buffer, type_.c_str()); SerializeValue(&buffer, axis_); SerializeValue(&buffer, dims_x_); SerializeValue(&buffer, dims_y_); } - nvinfer1::ElementWiseOperation type_; + std::string type_; 
nvinfer1::Dims dims_x_; nvinfer1::Dims dims_y_; int axis_; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 3075e87ea6d719a3f49d14c8c4b8015f7d688a50..b8a044fe99b91893c8c9ef661b4f46ebaa6db8c7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/prelu.h" namespace paddle { @@ -24,6 +25,17 @@ namespace inference { namespace tensorrt { namespace plugin { +PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) { + return new PReluPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize); + +int PReluPlugin::initialize() { + cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); + cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), + cudaMemcpyHostToDevice); +} + nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, // input dims is CHW. 
const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); - const float *alpha = reinterpret_cast(alpha_.get().values); + // const float *alpha = reinterpret_cast(alpha_.get().values); + const float *alpha = p_gpu_weight_; float *output = reinterpret_cast(outputs)[0]; std::vector input_shape; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index 0db56a310b072e64425f70ac23267ec72353e54b..a96649503f1c764e07370cb2b47b10f3dae72be4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -14,7 +14,12 @@ #pragma once +#include #include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,39 +29,51 @@ namespace tensorrt { namespace plugin { class PReluPlugin : public PluginTensorRT { - TensorRTEngine::Weight alpha_; + std::vector weight_; + float *p_gpu_weight_; std::string mode_; protected: size_t getSerializationSize() override { - // return getBaseSerializationSize(alpha_) + SerializedSize(mode_); - return 0; + return getBaseSerializationSize() + SerializedSize(mode_.c_str()) + + SerializedSize(weight_) + SerializedSize(getPluginType()); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. 
void serialize(void *buffer) override { - // serializeBase(buffer); - // SerializeValue(&buffer, alpha_); - // SerializeValue(&buffer, mode_); + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, weight_); + SerializeValue(&buffer, mode_.c_str()); } public: - PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode) - : alpha_(alpha), mode_(mode) {} + PReluPlugin(const float *weight, const int weight_num, + std::string const &mode) + : mode_(mode) { + weight_.resize(weight_num); + std::copy(weight, weight + weight_num, weight_.data()); + } // It was used for tensorrt deserialization. // It should not be called by users. PReluPlugin(void const *serialData, size_t serialLength) { - // deserializeBase(serialData, serialLength); - // DeserializeValue(&serialData, &serialLength, &alpha_); - // DeserializeValue(&serialData, &serialLength, &mode_); + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &weight_); + const char *prelu_mode; + DeserializeValue(&serialData, &serialLength, &prelu_mode); + mode_ = std::string(prelu_mode); } + ~PReluPlugin() { cudaFree(p_gpu_weight_); } + int initialize() override; - PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); } + PReluPlugin *clone() const override { + return new PReluPlugin(weight_.data(), weight_.size(), mode_); + } - const char *getPluginType() const override { return "prelu"; } + const char *getPluginType() const override { return "prelu_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index de61ace59e299a1f51940e4b433a0133d4fbe7ff..b5503c3b95ee2429dd865fd6de416a04aafbccf0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -15,12 +15,18 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { + return new SplitPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); + // copied from operators::math::SplitFunctor template __global__ void SplitKernel(const T* input_data, const int in_row, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 6f028d3d72ae3cc7d96c6782b734cdbf1243c06c..cbb72590567a35bee29387d4c00518b437913508 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,6 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: + SplitPlugin() {} SplitPlugin(int axis, std::vector const &output_lengths) : axis_(axis), same_shape_(true), output_length_(output_lengths) {} @@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { return new SplitPlugin(axis_, output_length_); } - const char *getPluginType() const override { return "split"; } + const char *getPluginType() const override { return "split_plugin"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(output_length_) + - getBaseSerializationSize(); + return SerializedSize(getPluginType()) + 
SerializedSize(axis_) + + SerializedSize(output_length_) + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 86084829e150f8a39610319a8f2138f2b2fdec68..3b737bd726ad09637f8530a114362d98d1dac1b0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -17,9 +17,10 @@ #include #include #include +#include #include -#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,6 +31,13 @@ namespace inference { namespace tensorrt { namespace plugin { +class PluginTensorRT; + +typedef std::function + PluginDeserializeFunc; + +typedef std::function PluginConstructFunc; + class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c20b6d1e725273dbfdc20c01fb01deea4e8d88e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, + const void* serial_data, + size_t serial_length) { + const char* plugin_type; + DeserializeValue(&serial_data, &serial_length, &plugin_type); + + PADDLE_ENFORCE(Has(plugin_type), + "trt plugin type %s does not exists, check it.", plugin_type); + auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); + owned_plugins_.emplace_back(plugin); + + return plugin; +} + +bool PluginFactoryTensorRT::RegisterPlugin( + const std::string& op_name, PluginDeserializeFunc deserialize_func) { + if (Has(op_name)) return false; + auto ret = plugin_registry_.emplace(op_name, deserialize_func); + return ret.second; +} + +void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..139c75595f9f44cacf7d14cda6b1c8eb4ef3c0ee --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PluginFactoryTensorRT : public nvinfer1::IPluginFactory, + public DeleteHelper { + public: + // Deserialization method + PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data, + size_t serial_length) override; + + bool RegisterPlugin(const std::string& op_name, + PluginDeserializeFunc deserialize_func); + + bool Has(const std::string& op_name) { + return plugin_registry_.find(op_name) != plugin_registry_.end(); + } + + void DestroyPlugins(); + + protected: + std::unordered_map plugin_registry_; + + std::list> owned_plugins_; +}; + +class TrtPluginRegistrar { + public: + TrtPluginRegistrar(const std::string& name, + PluginDeserializeFunc deserialize_func) { + inference::Singleton::Global().RegisterPlugin( + name, deserialize_func); + } +}; + +#define REGISTER_TRT_PLUGIN(name, deserialize_func) \ + REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func) + +#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ + static 
paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ + trt_plugin_registrar##ctr __attribute__((unused)) = \ + paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ + name, deserialize_func) + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h similarity index 96% rename from paddle/fluid/inference/tensorrt/plugin/serialize.h rename to paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index ce859f16fc87479adf090687121ff06951b5684c..1cae4ccae4cc593785d9b3b0e87523e740eef4ff 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once - #include +#include #include #include #include "paddle/fluid/platform/enforce.h" @@ -24,6 +24,13 @@ namespace inference { namespace tensorrt { namespace plugin { +// Some trt base classes lack of the destructor. +// We use a assisted class to fix this. +struct DeleteHelper { + protected: + virtual ~DeleteHelper() {} +}; + template inline void SerializeValue(void** buffer, T const& value); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81..a03dd45db0f80487cb4c2e6b68f94944e8558ae4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -17,6 +17,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/enforce.h" @@ -27,19 +29,34 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, stream_); + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + + engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); } void TearDown() override { - delete engine_; - cudaStreamDestroy(stream_); + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + TensorFromVector(input, *ctx_, &input_); + output_.Resize(framework::make_ddim(output_shape)); + } + + void GetOutput(std::vector *output) { + TensorToVector(output_, *ctx_, output); } protected: - TensorRTEngine* engine_; - cudaStream_t stream_; + framework::Tensor input_; + framework::Tensor output_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { @@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) { float raw_weight[size] = {2.}; // Weight in CPU memory. 
float raw_bias[size] = {3.}; + std::vector buffers(2); // TRT binded inputs + LOG(INFO) << "create weights"; TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 1, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) { ASSERT_EQ(engine_->engine()->getNbBindings(), 2); // fill in real data - float x_v = 1234; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 1 * sizeof(float)); + std::vector x_v = {1234}; + std::vector y_cpu; + PrepareInputOutput(x_v, {1}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + LOG(INFO) << "to execute"; - engine_->Execute(1); + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu; - engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float)); + GetOutput(&y_cpu); LOG(INFO) << "to checkout output"; - ASSERT_EQ(y_cpu, x_v * 2 + 3); + ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { @@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]] float raw_weight[4] = {1.0, 1.1, 3.3, 4.4}; float raw_bias[2] = {1.3, 2.4}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); - auto* x = engine_->DeclareInput("x", 
nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 2, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[2] = {1.0, 2.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 2 * sizeof(float)); - engine_->Execute(1); + // fill in real data + std::vector x_v = {1.0, 2.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu[2] = {-1., -1.}; + GetOutput(&y_cpu); auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[1], 1); - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } @@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. 
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 3, 3}); - auto* conv_layer = + auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); PADDLE_ENFORCE(conv_layer != nullptr); @@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 18 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {18}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[18]; - engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + GetOutput(&y_cpu); + ASSERT_EQ(y_cpu[0], 4.0); ASSERT_EQ(y_cpu[1], 6.0); } TEST_F(TensorRTEngineTest, test_pool2d) { // Weight in CPU memory. 
- auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 2}); + std::vector buffers(2); // TRT binded inputs nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; - auto* pool_layer = - TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), - pool_t, nvinfer1::DimsHW{2, 2}); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, + nvinfer1::DimsHW{2, 2}); PADDLE_ENFORCE(pool_layer != nullptr); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); @@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 8 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[2]; - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + GetOutput(&y_cpu); ASSERT_EQ(y_cpu[0], 2.0); ASSERT_EQ(y_cpu[1], 5.0); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index c27c39f40a2067dd2bd2150e4b1e53eab7cdf06e..36282b3efe5756da55b056c09e94aa352e3dcf8a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) { #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); - auto 
base_predictor = CreatePaddlePredictor(config); + std::vector> predictors; + predictors.emplace_back(CreatePaddlePredictor(config)); + for (int tid = 1; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(predictors.front()->Clone()); + } double total_time_of_threads{0}; std::vector threads; for (int tid = 0; tid < FLAGS_num_threads; tid++) { threads.emplace_back([&, tid] { - // To ensure the thread binding correctly, - // please clone inside the threadpool. - auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; NEW_TENSOR(data_lod_attention); NEW_TENSOR(cell_init); NEW_TENSOR(data); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index bd0059e18485c046df27d5ddbb39df9bbb249113..cca2ab1ee148b568e714c24dded7cd72403f0e5f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) { SetConfig(&config); config.SwitchUseFeedFetchOps(false); - auto base_predictor = CreatePaddlePredictor(config); + std::vector> predictors; + predictors.emplace_back(CreatePaddlePredictor(config)); + for (int tid = 1; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(predictors.front()->Clone()); + } double total_time_of_threads{0}; std::vector threads; for (int tid = 0; tid < FLAGS_num_threads; tid++) { threads.emplace_back([&, tid] { - // To ensure the thread binding correctly, - // please clone inside the threadpool. 
- auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; std::vector> inputs; PrepareZeroCopyInputs(predictor, &inputs); auto output_tensor = predictor->GetOutputTensor(out_var_name); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 2811eb4946ea025cf6c7ab197c4e603df86f6f2d..2e53fddfe7f6f0c5b31ff069fb1661f143022841 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -17,8 +17,10 @@ #include #include +#include #include #include // NOLINT +#include #include #ifdef WITH_GPERFTOOLS #include @@ -252,7 +254,11 @@ void TestMultiThreadPrediction( int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; std::vector threads; - auto main_predictor = CreateTestPredictor(config, use_analysis); + std::vector> predictors; + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + for (int tid = 1; tid < num_threads; tid++) { + predictors.emplace_back(predictors.front()->Clone()); + } size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { @@ -260,9 +266,7 @@ void TestMultiThreadPrediction( // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; - // To ensure the thread binding correctly, - // please clone inside the threadpool. 
- auto predictor = main_predictor->Clone(); + auto &predictor = predictors[tid]; #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 17a433c9d98768dbda4ba93bdceb6cc1717adc07..cb668a4174134ba3ce9517955ff740ada568e97b 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -54,7 +54,8 @@ void SetConfig(AnalysisConfig* config, std::string model_dir, if (use_gpu) { config->EnableUseGpu(100, 0); if (use_tensorrt) { - config->EnableTensorRtEngine(1 << 10, batch_size); + config->EnableTensorRtEngine(1 << 10, batch_size, 3, + AnalysisConfig::Precision::kFloat32, false); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 664b3b8420f7590dc605cb8c67059f334fc76139..15a722730074554e721cd0d887d0a4b740e2b40a 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -26,20 +26,17 @@ Allocator::~Allocator() {} bool Allocator::IsAllocThreadSafe() const { return false; } AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { - VLOG(2) << "Alloc allocation on " << typeid(*this).name(); auto ptr = AllocateImpl(size, attr); ptr->RegisterAllocatorChain(this); - VLOG(2) << "Alloc success"; return AllocationPtr(ptr); } void Allocator::FreeImpl(Allocation* allocation) { - auto* allocator = allocation->TopAllocator(); + Allocator* allocator = allocation->TopAllocator(); allocator->Free(allocation); } void Allocator::Free(Allocation* allocation) { - VLOG(2) << "Free allocation on " << typeid(*this).name(); allocation->PopAllocator(); FreeImpl(allocation); } @@ -47,7 +44,7 @@ void 
Allocator::Free(Allocation* allocation) { const char* BadAlloc::what() const noexcept { return msg_.c_str(); } void AllocationDeleter::operator()(Allocation* allocation) const { - auto* allocator = allocation->TopAllocator(); + Allocator* allocator = allocation->TopAllocator(); allocator->Free(allocation); } diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index f74fab3c7516db1ce9f49fad091125265d78e5f2..fabd1ff57fedc7376bc8d4dd48166607cd73b59a 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/small_stack.h" +#include "paddle/fluid/framework/inlined_stack.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -82,7 +82,7 @@ class Allocation { std::vector GetAllocatorChain() const { std::vector allocators; for (size_t i = 0; i < allocator_chain_.size(); ++i) { - allocators[i] = allocator_chain_[i]; + allocators.push_back(allocator_chain_[i]); } return allocators; } @@ -100,7 +100,7 @@ class Allocation { void* ptr_; size_t size_; platform::Place place_; - framework::SmallStack allocator_chain_; + framework::InlinedStack allocator_chain_; friend class Allocator; friend class AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 9c71c0bbcef3762591516691c18f231c89938453..bb52e4795faaee539b6905cc1246cf75dc877940 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(initial_gpu_memory_in_mb); +DECLARE_double(reallocate_gpu_memory_in_mb); DECLARE_bool(benchmark); namespace paddle { @@ -69,7 +71,8 @@ BuddyAllocator 
*GetCPUBuddyAllocator() { std::call_once(init_flag, []() { a = new detail::BuddyAllocator( std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(), + platform::CpuMaxChunkSize()); }); return a; @@ -131,40 +134,53 @@ size_t Used(const platform::CPUPlace &place) { } #ifdef PADDLE_WITH_CUDA -BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { - static std::once_flag init_flag; - static detail::BuddyAllocator **a_arr = nullptr; - static std::vector devices; - - std::call_once(init_flag, [gpu_id]() { - devices = platform::GetSelectedDevices(); - int gpu_num = devices.size(); - - allocation::GPUMemMonitor.Initialize(devices.size()); +class GPUBuddyAllocatorList { + public: + GPUBuddyAllocatorList() + : allocators_(platform::GetCUDADeviceCount()), + flags_(platform::GetCUDADeviceCount()) { + allocation::GPUMemMonitor.Initialize(allocators_.size()); + } - a_arr = new BuddyAllocator *[gpu_num]; - for (size_t i = 0; i < devices.size(); ++i) { - int dev_id = devices[i]; - a_arr[i] = nullptr; + BuddyAllocator *Get(size_t dev_id) { + PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id); + std::call_once(flags_[dev_id], [this, dev_id] { platform::SetDeviceId(dev_id); - a_arr[i] = new BuddyAllocator(std::unique_ptr( - new detail::GPUAllocator(dev_id)), - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; - } - }); + size_t first_size = platform::GpuFirstAllocateChunkSize(); + size_t re_size = platform::GpuReAllocateChunkSize(); + allocators_[dev_id] = + new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), first_size, re_size); + VLOG(2) 
<< "\n\nNOTE: each GPU device use " + << string::HumanReadableSize(first_size) << "(initial chunk) " + << string::HumanReadableSize(re_size) << "(reallocate chunk) " + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' or " + "'FLAGS_initial_gpu_memory_in_mb/" + "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction " + "of GPU usage.\n\n"; + VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use=" + << FLAGS_fraction_of_gpu_memory_to_use << ", " + << "FLAGS_initial_gpu_memory_in_mb=" + << FLAGS_initial_gpu_memory_in_mb << ", " + << "FLAGS_reallocate_gpu_memory_in_mb=" + << FLAGS_reallocate_gpu_memory_in_mb; + }); + return allocators_[dev_id]; + } + + private: + std::vector allocators_; + std::vector flags_; +}; +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static GPUBuddyAllocatorList allocators; platform::SetDeviceId(gpu_id); - auto pos = std::distance(devices.begin(), - std::find(devices.begin(), devices.end(), gpu_id)); - return a_arr[pos]; + return allocators.Get(gpu_id); } #endif @@ -183,7 +199,7 @@ void *Alloc(const platform::CUDAPlace &place, #ifdef PADDLE_WITH_CUDA auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr) { + if (ptr == nullptr && size > 0) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); size_t avail, total; @@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { ba = new BuddyAllocator(std::unique_ptr( new detail::CUDAPinnedAllocator), platform::CUDAPinnedMinChunkSize(), + platform::CUDAPinnedMaxChunkSize(), platform::CUDAPinnedMaxChunkSize()); }); diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc index 44240121f055dbbcb9246b00ee5d5ae38d7c1a55..7f96025a1a9e7152dde1afbd8713d9614a3c115e 100644 --- 
a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc @@ -14,16 +14,90 @@ #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h" #include +#include +#include #include +#include +#include #include "paddle/fluid/platform/lock_guard_ptr.h" DEFINE_double(tolerant_times, 2, "Tolerant memory size times of buffered_allocator"); +DEFINE_string(division_plan_path, "", "Division plan file path"); + namespace paddle { namespace memory { namespace allocation { +std::string TrimStringAndToLowerCase(const std::string &str) { + auto not_space = [](char ch) { return std::isspace(ch) == 0; }; + auto first_idx = static_cast( + std::find_if(str.begin(), str.end(), not_space) - str.begin()); + auto last_idx = static_cast( + std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin()); + if (first_idx == str.size() || last_idx == str.size()) return ""; + + last_idx = str.size() - 1 - last_idx; + auto ret = str.substr(first_idx, last_idx - first_idx); + std::for_each(ret.begin(), ret.end(), + [](char &ch) { ch = std::tolower(ch); }); + return ret; +} + +static size_t ParseStringToBytes(const std::string &str) { + std::string ret = str; + if (ret.back() == 'b') { + ret.pop_back(); + } + + PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str); + size_t multiples = 1; + switch (ret.back()) { + case 'g': + multiples *= (static_cast(1) << 30); + break; + case 'm': + multiples *= (static_cast(1) << 20); + break; + case 'k': + multiples *= (static_cast(1) << 10); + break; + default: + break; + } + + if (multiples != 1) ret.pop_back(); + ret = TrimStringAndToLowerCase(ret); + double ret_val = 0.0; + std::stringstream ss(ret); + PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str); + return static_cast(ret_val * multiples); +} + +static std::string GetDebugStringOfPlan(const std::vector &plan) { + std::string ret("["); + for (auto sz : plan) { + ret += string::HumanReadableSize(sz); + 
ret += ", "; + } + return ret + "]"; +} + +static std::vector ReadDivisionPlanFromFile( + const std::string &filepath) { + std::ifstream is(filepath.c_str()); + PADDLE_ENFORCE(is.good(), "File not exist"); + std::string str; + std::vector plan; + while (std::getline(is, str).good()) { + str = TrimStringAndToLowerCase(str); + if (str.empty()) break; + plan.push_back(ParseStringToBytes(str)); + } + return plan; +} + static void CheckAndModifyMemoryDivisionPlan( std::vector *division_plan) { // Check whether the division plan is strictly sorted @@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan( } static std::vector GetDefaultDivisionPlan() { + if (!FLAGS_division_plan_path.empty()) { + return ReadDivisionPlanFromFile(FLAGS_division_plan_path); + } + + constexpr size_t kMaxLogSize = 30; + std::vector plan; + for (size_t i = 12; i <= kMaxLogSize; ++i) { + plan.push_back(static_cast(1) << i); + } + /* for (size_t i = 0; i < sizeof(size_t) * 8; ++i) { plan.push_back(static_cast(1) << i); } + */ return plan; } @@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator( : underlying_allocator_(std::move(underlying_allocator)), division_plan_(division_plan) { CheckAndModifyMemoryDivisionPlan(&division_plan_); - allocations_.resize(division_plan_.size()); - mtx_.resize(division_plan_.size()); + allocations_.resize(division_plan_.size() - 1); + mtx_.resize(division_plan_.size() - 1); if (underlying_allocator_->IsAllocThreadSafe()) { for (auto &mtx : mtx_) { mtx.reset(new std::mutex()); } } + VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_); VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times; } void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) { auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size()); - { + if (bin_index < allocations_.size()) { platform::LockGuardPtr guard(mtx_[bin_index]); allocations_[bin_index].emplace(allocation->size(), AllocationPtr(allocation)); + } else 
{ + underlying_allocator_->Free(allocation); } } -void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) { +// bin_index is not used currently. +// Maybe we can design more flexible FreeCache strategy based on bin_index +size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) { size_t accumulated_size = 0; // FIXME(zjl): free the largest first when there is no extra for (size_t i = allocations_.size() - 1; i != static_cast(-1); --i) { @@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) { underlying_allocator_->Free(it->second.release()); allocations_[i].erase(it--); if (accumulated_size >= size) { - return; + return accumulated_size; } } while (!allocations_[i].empty()); } + return accumulated_size; } Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) { auto bin_index = FindDivisionPlanBinIndex(division_plan_, size); auto upper_size = TolerantUpperSize(size); - for (; upper_size >= division_plan_[bin_index]; ++bin_index) { + // if (bin_index >= allocations_.size()) { + // VLOG(2) << "Allocate " << size << " from underlying directly"; + //} + + for (; bin_index < allocations_.size() && + upper_size >= division_plan_[bin_index]; + ++bin_index) { auto &allocation = allocations_[bin_index]; platform::LockGuardPtr lock(mtx_[bin_index]); auto it = allocation.lower_bound(size); - if (it != allocation.end() && it->second->size() < upper_size) { + if (it != allocation.end() && it->second->size() <= upper_size) { + size_t sz = it->second->size(); auto ret = std::move(it->second); allocation.erase(it); + VLOG(3) << "Allocate " << sz << "(required " << size + << ") from cache directly"; return ret.release(); } } - try { - return underlying_allocator_->Allocate(size, attr).release(); - } catch (BadAlloc &) { - VLOG(2) << "BadAlloc raises, try to free " << size << " caches"; - FreeCache(size, bin_index); - return underlying_allocator_->Allocate(size, attr).release(); + size_t 
retry_time = 1; + while (true) { + try { + auto ret = underlying_allocator_->Allocate(size, attr).release(); + VLOG(2) << "Allocate " << size << " from underlying directly"; + return ret; + } catch (BadAlloc &) { + VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size + << " bytes caches"; + // size_t actual_free_size = FreeCache(size, bin_index); + size_t actual_free_size = FreeCache(-1UL, bin_index); + VLOG(1) << retry_time << "-th free " << actual_free_size + << " bytes caches"; + if (actual_free_size == 0) throw; + } + ++retry_time; } } diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h index e2437ff7e35832e80eaf15b5a4a6cd2c83cfe537..f550f76e50c3856cab2573210395bef1e26ebd17 100644 --- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h +++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h @@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator { void FreeImpl(Allocation* allocation) override; private: - void FreeCache(size_t size, size_t bin_index); + size_t FreeCache(size_t size, size_t bin_index); std::shared_ptr underlying_allocator_; std::vector> allocations_; diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 26ef27c3caafadb4801b0ae52133f6175655ce0a..80d32ba564c3060fe2dc4e1a7eb499eda2c1e1d3 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -25,9 +25,11 @@ namespace detail { BuddyAllocator::BuddyAllocator( std::unique_ptr system_allocator, size_t min_chunk_size, - size_t max_chunk_size) + size_t first_allocate_chunk_size, size_t reallocate_chunk_size) : min_chunk_size_(min_chunk_size), - max_chunk_size_(max_chunk_size), + first_allocate_chunk_size_(first_allocate_chunk_size), + reallocate_chunk_size_(reallocate_chunk_size), + max_chunk_size_(first_allocate_chunk_size), 
cache_(system_allocator->UseGpu()), system_allocator_(std::move(system_allocator)) {} @@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() { "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + auto desc = cache_.load(block); + VLOG(10) << "Free from block (" << block << ", " << desc.size << ")"; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, desc.size, desc.index); cache_.invalidate(block); pool_.erase(pool_.begin()); } @@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { VLOG(10) << "Allocate from system allocator."; - return SystemAlloc(size); + return SystemAlloc(size, false); } // query and allocate from the existing chunk @@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { it = RefillPool(); - // if still failure, fail fatally + // if still failure, try to allocate from SystemAllocator if (it == pool_.end()) { - return nullptr; + return SystemAlloc(size, false); } } else { VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) @@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) { VLOG(10) << "Free from address " << block; - if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) { VLOG(10) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) { size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } -size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } +size_t BuddyAllocator::GetMaxChunkSize() { + 
std::lock_guard lock(mutex_); + return max_chunk_size_; +} -void* BuddyAllocator::SystemAlloc(size_t size) { +void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) { size_t index = 0; void* p = system_allocator_->Alloc(&index, size); @@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) { if (p == nullptr) return nullptr; - static_cast(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index, - size, nullptr, nullptr); + static_cast(p)->init( + &cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK + : MemoryBlock::UNMANAGED_HUGE_CHUNK, + index, size, nullptr, nullptr); return static_cast(p)->data(); } BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { -#ifdef PADDLE_WITH_CUDA - if (system_allocator_->UseGpu()) { - if ((total_used_ + total_free_) == 0) { - // Compute the maximum allocation size for the first allocation. - max_chunk_size_ = platform::GpuMaxChunkSize(); - } + if (total_used_ + total_free_ > 0) { + max_chunk_size_ = reallocate_chunk_size_; } -#endif // Allocate a new maximum sized block size_t index = 0; - void* p = system_allocator_->Alloc(&index, max_chunk_size_); + size_t chunk_size = max_chunk_size_; + void* p = system_allocator_->Alloc(&index, chunk_size); if (p == nullptr) return pool_.end(); @@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { << " from system allocator"; static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, - max_chunk_size_, nullptr, nullptr); + chunk_size, nullptr, nullptr); // gpu fallback allocation if (system_allocator_->UseGpu() && @@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { fallback_alloc_count_++; } - total_free_ += max_chunk_size_; + total_free_ += chunk_size; // dump the block into pool - return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; + return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) 
{ @@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, void BuddyAllocator::CleanIdleFallBackAlloc() { // If fallback allocation does not exist, return directly - if (!fallback_alloc_count_) return; + if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return; for (auto pool = pool_.rbegin(); pool != pool_.rend();) { - // If free memory block less than max_chunk_size_, return directly - if (std::get<1>(*pool) < max_chunk_size_) return; - MemoryBlock* block = static_cast(std::get<2>(*pool)); - // If no GPU fallback allocator, return - if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + auto desc = cache_.load(block); + if (desc.index == 0) { return; } VLOG(10) << "Return block " << block << " to fallback allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, desc.size, block->index(cache_)); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= desc.size; fallback_alloc_count_--; // If no fall allocation exists, return directly @@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() { if (!shall_free_alloc()) return; for (auto pool = pool_.rbegin(); pool != pool_.rend();) { - // If free memory block less than max_chunk_size_, return directly - if (std::get<1>(*pool) < max_chunk_size_) return; - MemoryBlock* block = static_cast(std::get<2>(*pool)); + auto desc = cache_.load(block); + + if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) { + return; + } VLOG(10) << "Return block " << block << " to base allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, desc.size, desc.index); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= desc.size; if (!shall_free_alloc()) return; } diff 
--git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 3f86a51f0d0b8504bbc4b0477f123093b343e9cf..88d6f736a8f10e4481a061e22e6c75450c3038f9 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -34,7 +34,8 @@ namespace detail { class BuddyAllocator { public: BuddyAllocator(std::unique_ptr system_allocator, - size_t min_chunk_size, size_t max_chunk_size); + size_t min_chunk_size, size_t first_allocate_chunk_size, + size_t reallocate_chunk_size); ~BuddyAllocator(); @@ -57,7 +58,7 @@ class BuddyAllocator { using PoolSet = std::set; /*! \brief Allocate fixed-size memory from system */ - void* SystemAlloc(size_t size); + void* SystemAlloc(size_t size, bool is_managed = true); /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); @@ -87,7 +88,11 @@ class BuddyAllocator { size_t total_free_ = 0; // the total size of free memory size_t min_chunk_size_; // the minimum size of each chunk - size_t max_chunk_size_; // the maximum size of each chunk + + size_t first_allocate_chunk_size_; + size_t reallocate_chunk_size_; + + size_t max_chunk_size_; private: /** diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h index 5cceba659beeec1b3c986dc43229f6725e3e11de..5e5ff5b849d795f36e3b53ae626617ae7eea2751 100644 --- a/paddle/fluid/memory/detail/memory_block.h +++ b/paddle/fluid/memory/detail/memory_block.h @@ -27,10 +27,11 @@ class MetadataCache; // MemoryBlock::Desc and the payload. 
struct MemoryBlock { enum Type { - FREE_CHUNK, // memory is free and idle - ARENA_CHUNK, // memory is being occupied - HUGE_CHUNK, // memory is out of management - INVALID_CHUNK // memory is invalid + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + MANAGED_HUGE_CHUNK, // memory is huge and managed by allocator + UNMANAGED_HUGE_CHUNK, // memory is huge and out of management + INVALID_CHUNK // memory is invalid }; // init saves the MemoryBlock::Desc of the memory block in a MetadataCache. diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 064903c299d947df3c6b42d916fce8dcbd85eebb..fec091255f6391b77cd2858905f3aa2e5dd8baff 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) { // Initialize the OpDesc if (op_desc_info.Has(config_.op_type)) { type_ = config_.op_type; - op_desc_.SetType(config_.op_type); + CreateOpDesc(); CreateInputVarDesc(); CreateOutputVarDesc(); } else { @@ -131,6 +131,40 @@ std::vector OpTester::GetOpProtoOutputNames() { return output_names; } +std::unordered_map +OpTester::GetOpProtoAttrNames() { + std::unordered_map attr_types; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + const std::vector skipped_attrs = { + framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(), + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()}; + for (int i = 0; i != proto.attrs_size(); ++i) { + const auto &attr = proto.attrs(i); + if (!Has(skipped_attrs, attr.name())) { + VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type(); + attr_types[attr.name()] = attr.type(); + } + } + return attr_types; +} + +framework::proto::VarType::Type 
OpTester::TransToVarType(std::string str) { + if (str == "int32") { + return framework::proto::VarType::INT32; + } else if (str == "int64") { + return framework::proto::VarType::INT64; + } else if (str == "fp32") { + return framework::proto::VarType::FP32; + } else if (str == "fp64") { + return framework::proto::VarType::FP64; + } else { + PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + } +} + void OpTester::CreateInputVarDesc() { std::vector input_names = GetOpProtoInputNames(); for (auto &name : input_names) { @@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() { // Need to support more type var->SetType(framework::proto::VarType::LOD_TENSOR); var->SetPersistable(false); - var->SetDataType(framework::proto::VarType::FP32); + var->SetDataType(TransToVarType(input->dtype)); var->SetShape(input->dims); op_desc_.SetInput(name, {var_name}); - input_lods_[var_name] = input->lod; + inputs_[var_name] = *input; } } @@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() { } } +void OpTester::CreateOpDesc() { + op_desc_.SetType(config_.op_type); + std::unordered_map attr_types = + GetOpProtoAttrNames(); + for (auto item : config_.attrs) { + const std::string &name = item.first; + if (attr_types.find(name) == attr_types.end()) { + LOG(FATAL) << "Operator " << type_ << " do not have attr " << name; + } + + const std::string &value_str = item.second; + const framework::proto::AttrType &type = attr_types[name]; + switch (type) { + case framework::proto::AttrType::BOOLEAN: + break; + case framework::proto::AttrType::INT: { + int value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::FLOAT: { + float value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::STRING: { + op_desc_.SetAttr(name, {value_str}); + } break; + case framework::proto::AttrType::BOOLEANS: + case framework::proto::AttrType::INTS: + case framework::proto::AttrType::FLOATS: + case 
framework::proto::AttrType::STRINGS: + LOG(FATAL) << "Not supported yet."; + break; + case framework::proto::AttrType::LONG: { + int64_t value = StringTo(value_str); + op_desc_.SetAttr(name, value); + } break; + case framework::proto::AttrType::LONGS: + default: + PADDLE_THROW("Unsupport attr type %d", type); + } + } +} + framework::VarDesc *OpTester::Var(const std::string &name) { auto it = vars_.find(name); if (it != vars_.end()) { @@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) { template void OpTester::SetupTensor(framework::LoDTensor *tensor, - const std::vector &shape, T lower, - T upper) { + const std::vector &shape, T lower, T upper, + const std::string &initializer) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); T *ptr = tensor->mutable_data(framework::make_ddim(shape), place_); - if (platform::is_cpu_place(place_)) { - for (int i = 0; i < tensor->numel(); ++i) { - ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } + + framework::LoDTensor cpu_tensor; + T *cpu_ptr = nullptr; + + if (!platform::is_cpu_place(place_)) { + cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), + platform::CPUPlace()); } else { - framework::LoDTensor cpu_tensor; - T *cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), - platform::CPUPlace()); + cpu_ptr = ptr; + } + + if (initializer == "random") { for (int i = 0; i < cpu_tensor.numel(); ++i) { cpu_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); } + } else if (initializer == "natural") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = lower + i; + } + } else if (initializer == "zeros") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = 0; + } + } else { + PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + } + + if (!platform::is_cpu_place(place_)) { TensorCopySync(cpu_tensor, place_, tensor); } } @@ -219,7 +313,7 @@ void 
OpTester::CreateVariables(framework::Scope *scope) { } } - for (auto &item : input_lods_) { + for (auto &item : inputs_) { // Allocate memory for input tensor auto &var_name = item.first; VLOG(3) << "Allocate memory for tensor " << var_name; @@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) { auto *var = scope->Var(var_name); auto *tensor = var->GetMutable(); - SetupTensor(tensor, shape, static_cast(0.0), - static_cast(1.0)); + const auto &data_type = var_desc->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::INT64) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::FP32) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else if (data_type == framework::proto::VarType::FP64) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else { + PADDLE_THROW("Unsupported dtype %d.", data_type); + } VLOG(3) << "Set lod for tensor " << var_name; - std::vector> &lod_vec = item.second; + std::vector> &lod_vec = item.second.lod; framework::LoD lod; for (size_t i = 0; i < lod_vec.size(); ++i) { lod.push_back(lod_vec[i]); @@ -261,7 +367,16 @@ std::string OpTester::DebugString() { ss << GenSpaces(count) << "type: LOD_TENSOR\n"; ss << GenSpaces(count++) << "lod_tensor {\n"; ss << GenSpaces(count++) << "tensor {\n"; - ss << GenSpaces(count) << "data_type: FP32\n"; + const auto &data_type = var->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + ss << GenSpaces(count) << "data_type: INT32\n"; + } else if (data_type == framework::proto::VarType::INT64) { + ss << GenSpaces(count) << "data_type: INT64\n"; + } else if (data_type == framework::proto::VarType::FP32) { + ss << GenSpaces(count) << "data_type: FP32\n"; + } else if (data_type == 
framework::proto::VarType::FP64) { + ss << GenSpaces(count) << "data_type: FP64\n"; + } std::vector shape = var->GetShape(); for (auto d : shape) { ss << GenSpaces(count) << "dims: " << d << "\n"; @@ -288,6 +403,63 @@ std::string OpTester::DebugString() { ss << GenSpaces(--count) << "}\n"; } ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; + for (auto &name : op_desc_.AttrNames()) { + ss << GenSpaces(count++) << "attrs {\n"; + const auto &attr_type = op_desc_.GetAttrType(name); + const auto &attr = op_desc_.GetAttr(name); + ss << GenSpaces(count) << "name: \"" << name << "\"\n"; + switch (attr_type) { + case framework::proto::AttrType::BOOLEAN: { + ss << GenSpaces(count) << "type: BOOLEAN\n"; + ss << GenSpaces(count) << "b: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::INT: { + ss << GenSpaces(count) << "type: INT\n"; + ss << GenSpaces(count) << "i: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::FLOAT: { + ss << GenSpaces(count) << "type: FLOAT\n"; + ss << GenSpaces(count) << "f: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::STRING: { + ss << GenSpaces(count) << "type: STRING\n"; + ss << GenSpaces(count) << "s: \"" << boost::get(attr) + << "\"\n"; + } break; + case framework::proto::AttrType::BOOLEANS: { + ss << GenSpaces(count) << "type: BOOLEANS\n"; + ss << GenSpaces(count) << "bools: " + << "\n"; + } break; + case framework::proto::AttrType::INTS: { + ss << GenSpaces(count) << "type: INTS\n"; + ss << GenSpaces(count) << "ints: " + << "\n"; + } break; + case framework::proto::AttrType::FLOATS: { + ss << GenSpaces(count) << "type: FLOATS\n"; + ss << GenSpaces(count) << "floats: " + << "\n"; + } break; + case framework::proto::AttrType::STRINGS: { + ss << GenSpaces(count) << "type: STRINGS\n"; + ss << GenSpaces(count) << "strings: " + << "\n"; + } break; + case framework::proto::AttrType::LONG: { + ss << GenSpaces(count) << "type: LONG\n"; + ss << 
GenSpaces(count) << "l: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::LONGS: { + ss << GenSpaces(count) << "type: LONGS\n"; + ss << GenSpaces(count) << "longs: " + << "\n"; + } break; + default: + PADDLE_THROW("Unsupport attr type %d", attr_type); + } + ss << GenSpaces(--count) << "}\n"; + } ss << GenSpaces(--count) << "}\n"; return ss.str(); } @@ -299,6 +471,7 @@ TEST(op_tester, base) { FLAGS_op_config_list.c_str()); std::vector op_configs; while (!fin.eof()) { + VLOG(4) << "Reading config " << op_configs.size() << "..."; OpTesterConfig config; bool result = config.Init(fin); if (result) { diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h index 8f150b23ad783acdfd203d471d578ab6aae71494..328389293c4b71a2f1fefbc3bf26fd46b79ec6e2 100644 --- a/paddle/fluid/operators/benchmark/op_tester.h +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_desc.h" @@ -39,16 +41,21 @@ class OpTester { private: std::vector GetOpProtoInputNames(); std::vector GetOpProtoOutputNames(); + std::unordered_map + GetOpProtoAttrNames(); + framework::proto::VarType::Type TransToVarType(std::string str); void CreateInputVarDesc(); void CreateOutputVarDesc(); + void CreateOpDesc(); framework::VarDesc *Var(const std::string &name); void CreateVariables(framework::Scope *scope); template void SetupTensor(framework::LoDTensor *input, - const std::vector &shape, T lower, T upper); + const std::vector &shape, T lower, T upper, + const std::string &initializer); void RunImpl(); @@ -57,7 +64,7 @@ class OpTester { std::string type_; framework::OpDesc op_desc_; std::unordered_map> vars_; - std::unordered_map>> input_lods_; + std::unordered_map inputs_; std::unique_ptr op_; platform::Place place_; std::unique_ptr scope_; diff --git 
a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 8336804ec07d2b7b176f55ad4113452086296494..b4878ab04244cf6b54d323943fc1fbf4e3882660 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/benchmark/op_tester_config.h" #include -#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str, } } +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dtype" || sep == "dtype:") { + ParseDType(is); + } else if (sep == "initializer" || sep == "initializer:") { + ParseInitializer(is); + } else if (sep == "dims" || sep == "dims:") { + ParseDims(is); + } else if (sep == "lod" || sep == "lod:") { + ParseLoD(is); + } + } + } +} + +void OpInputConfig::ParseDType(std::istream& is) { + std::string dtype_str; + is >> dtype_str; + EraseEndSep(&dtype_str); + + if (dtype_str == "int32" || dtype_str == "int") { + dtype = "int32"; + } else if (dtype_str == "int64" || dtype_str == "long") { + dtype = "int64"; + } else if (dtype_str == "fp32" || dtype_str == "float") { + dtype = "fp32"; + } else if (dtype_str == "fp64" || dtype_str == "double") { + dtype = "fp64"; + } else { + PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + } + VLOG(4) << "dtype of input " << name << " is: " << dtype; +} + +void OpInputConfig::ParseInitializer(std::istream& is) { + std::string initializer_str; + is >> initializer_str; + EraseEndSep(&initializer_str); + + const std::vector supported_initializers = {"random", "natural", + "zeros"}; + if (!Has(supported_initializers, initializer_str)) { + 
PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + } + + initializer = initializer_str; + VLOG(4) << "initializer of input " << name << " is: " << initializer; +} + void OpInputConfig::ParseDims(std::istream& is) { std::string dims_str; is >> dims_str; @@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) { number += lod_str[i]; ++i; } - level.push_back(atoi(number.c_str())); + level.push_back(StringTo(number)); } lod.push_back(level); } else if (lod_str[i] == '}') { @@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) { } } -OpInputConfig::OpInputConfig(std::istream& is) { - std::string sep; - is >> sep; - if (sep == kStartSeparator) { - while (sep != kEndSeparator) { - is >> sep; - if (sep == "name" || sep == "name:") { - is >> name; - EraseEndSep(&name); - } else if (sep == "dims" || sep == "dims:") { - ParseDims(is); - } else if (sep == "lod" || sep == "lod:") { - ParseLoD(is); - } - } - } -} - OpTesterConfig::OpTesterConfig(const std::string& filename) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", @@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) { is >> value; EraseEndSep(&key, ":"); EraseEndSep(&value); + VLOG(4) << "attrs: " << key << ", " << value; attrs[key] = value; } diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h index c2ff6dafc053eb7202a686954d53ae6f3d62d02e..5803f82ac28867a481875c2af607290c5d366146 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.h +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -27,10 +28,14 @@ struct OpInputConfig { OpInputConfig() {} explicit OpInputConfig(std::istream& is); + void ParseDType(std::istream& is); + void ParseInitializer(std::istream& is); void ParseDims(std::istream& is); void ParseLoD(std::istream& is); std::string name; + std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double + std::string initializer{"random"}; // random, natural, zeros std::vector dims; std::vector> lod; }; @@ -55,6 +60,23 @@ struct OpTesterConfig { double runtime{0.0}; }; +static bool Has(const std::vector& vec, const std::string& item) { + for (size_t i = 0; i < vec.size(); ++i) { + if (vec[i] == item) { + return true; + } + } + return false; +} + +template +T StringTo(const std::string& str) { + std::istringstream is(str); + T value; + is >> value; + return value; +} + } // namespace benchmark } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 8d6a498dc941e44688ec8a2b49a6e080608f9b85..0c517cc757ca3f6f1ff7f4191ab2d529890b7154 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" @@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { Cast Operator. This Operator casts the input tensor to another data type and -returns tha Output Tensor. +returns the Output Tensor. It's meaningless if the output dtype equals +the input dtype, but it's fine if you do so. 
)DOC"); } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index f6fbe97565c43c306ea885c765c0a665492fa317..c87837e69424335ac926bf05664e5f79940390b5 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) if(WITH_GPU) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) endif() detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..945d575a6446429a0ec34a603356c2c99263a776 --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +class BoxDecoderAndAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("BoxScore"), + "Input(BoxScore) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("DecodeBox"), + "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutputAssignBox"), + "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + auto box_score_dims = ctx->GetInputDim("BoxScore"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBox must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1, + "The rank of Input of PriorBoxVar must be 1"); 
+ PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4, + "The shape of PriorBoxVar is [4]"); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(box_score_dims.size(), 2, + "The rank of Input of BoxScore must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0], + "The first dim of prior_box and target_box is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0], + "The first dim of prior_box and box_score is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1], + "The shape of target_box is [N, classnum * 4], The shape " + "of box_score is [N, classnum], The shape of prior_box " + "is [N, 4]"); + + ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0], + target_box_dims[1]})); + ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox"); + ctx->SetOutputDim( + "OutputAssignBox", + framework::make_ddim({prior_box_dims[0], prior_box_dims[1]})); + ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox"); + } +}; + +class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N " + "boxes and each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor, optional) " + "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N " + "group of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); + AddInput("TargetBox", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum*4]. 
It holds N targets for N boxes."); + AddInput("BoxScore", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum], each box is represented as [classnum] which is " + "the classification probabilities."); + AddAttr("box_clip", + "(float, default 4.135, np.log(1000. / 16.)) " + "clip box to prevent overflowing") + .SetDefault(4.135f); + AddOutput("DecodeBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, classnum * 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances for each class."); + AddOutput("OutputAssignBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances with the best non-background class " + "by BoxScore."); + AddComment(R"DOC( + +Bounding Box Coder. + +Decode the target bounding box with the prior_box information. + +The Decoding schema is described below: + + $$ + ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} + $$ + $$ + oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2} + $$ + $$ + ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2} + $$ + $$ + oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2} + $$ + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the +decoded coordinates, width and height in decode_box. + +decode_box is obtained after box decode, then assigning schema is described below: + +For each prior_box, use the best non-background class's decoded values to +update the prior_box locations and get output_assign_box. So, the shape of +output_assign_box is the same as PriorBox. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp, + ops::BoxDecoderAndAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignKernel, + ops::BoxDecoderAndAssignKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..25e6545eb59bde5e080dc907f9ecd4281062413f --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { +/* Decodes one box per (roi, class) pair; thread idx handles roi i = idx / class_num and class j = idx % class_num. The prior box is read as four values per roi, with width and height computed from the corner differences plus 1 and the center derived from them. The deltas in target_box_data (four per roi-class pair) are scaled by prior_box_var_data[0..3] (only these four entries are read, so a single variance set is shared by all boxes), the size deltas are capped at box_clip, and the resulting corners are written to output_box_data at the same offset. */ +template +__global__ void DecodeBoxKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int roi_num, + const int class_num, const T box_clip, + T* output_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num * class_num) { + int i = idx / class_num; + int j = idx % class_num; + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; +/* Four consecutive values per (roi, class) pair; the same offset indexes both the input deltas and the output box. */ + int offset = i * class_num * 4 + j * 4; + T dw = prior_box_var_data[2] * target_box_data[offset + 2]; + T dh = prior_box_var_data[3] * target_box_data[offset + 3]; + if (dw > box_clip) { + dw = box_clip; + } + if (dh > box_clip) { + dh = box_clip; + } + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; +/* NOTE(review): expf is the single-precision exponential while the CPU kernel in box_decoder_and_assign_op.h uses std::exp; if this kernel is instantiated for double, results may differ in precision - confirm. */ target_box_width = expf(dw) * prior_box_width; + target_box_height = expf(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } +} +/* For each roi (one thread per roi), finds the class j > 0 with the largest score in box_score_data (class 0 is never selected) and copies that class's decoded box from output_box_data into output_assign_box_data. If no class qualifies (max_j stays -1, e.g. class_num <= 1 or no score above the initial -1), the roi's prior box is copied instead. */ +template +__global__ void AssignBoxKernel(const T* prior_box_data, + const T* box_score_data, T* 
output_box_data, + const int roi_num, const int class_num, + T* output_assign_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num) { + int i = idx; + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } +} +/* CUDA kernel for box_decoder_and_assign: allocates DecodeBox as {roi_num, class_num * 4} and OutputAssignBox as {roi_num, 4}, then runs DecodeBoxKernel followed by AssignBoxKernel. roi_num comes from TargetBox dims[0] and class_num from BoxScore dims[1]. */ +template +class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + + auto roi_num = target_box->dims()[0]; + auto class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); +/* Ceil-divide so every (roi, class) pair is covered by a thread; the kernels guard against the surplus threads with their idx bounds checks. */ + int block = 512; + int grid = (roi_num * class_num + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T box_clip = context.Attr("box_clip"); + + DecodeBoxKernel<<>>( + prior_box_data, 
prior_box_var_data, target_box_data, roi_num, class_num, + box_clip, output_box_data); +/* NOTE(review): AssignBoxKernel reads the output_box_data written by DecodeBoxKernel. Wait() presumably blocks the host until queued device work finishes; if both launches use the same stream, stream ordering may already guarantee this - confirm whether the host-side waits are required. */ + context.device_context().Wait(); + int assign_grid = (roi_num + block - 1) / block; + AssignBoxKernel<<>>( + prior_box_data, box_score_data, output_box_data, roi_num, class_num, + output_assign_box_data); + context.device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e66a8351f4761fc805dbd2e44f237c751642d816 --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +/* CPU kernel for box_decoder_and_assign. For every roi it decodes one box per class into DecodeBox ({roi_num, class_num * 4}) and then copies the decoded box of the highest-scoring class with index > 0 into OutputAssignBox ({roi_num, 4}), falling back to the prior box when no such class is found. */ +template +class BoxDecoderAndAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + int roi_num = target_box->dims()[0]; + int class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); + const T bbox_clip = context.Attr("box_clip"); +/* Prior box i is read as four values; width and height are the corner differences plus 1, and the center is the first corner plus half the size. */ + for (int i = 0; i < roi_num; ++i) { + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; + for (int j = 0; j < class_num; ++j) { +/* NOTE(review): i, class_num and j are int, so the product is evaluated in int before being widened to int64_t - confirm overflow is not a concern for large inputs. */ int64_t offset = i * class_num * 4 + j * 4; + T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2], + bbox_clip); + T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3], + bbox_clip); + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * 
target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = prior_box_var_data[1] * + target_box_data[offset + 1] * + prior_box_height + + prior_box_center_y; + target_box_width = std::exp(dw) * prior_box_width; + target_box_height = std::exp(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = + target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } +/* Pick the best class for roi i; j == 0 is excluded by the j > 0 test, so max_j is either -1 or >= 1. */ + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } +/* Copy the winning class's decoded box, or the prior box when no class was selected. */ + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d36876efd747d9e6f90c0d0200a9e9610a5318c --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + +namespace paddle { +namespace operators { +/* Operator definition for distribute_fpn_proposals: shape inference and kernel-type selection. */ +class DistributeFpnProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +/* Checks that FpnRois is present and MultiFpnRois is non-empty, then declares max_level - min_level + 1 outputs of shape {-1, 4} for MultiFpnRois and shape {1, -1} for RestoreIndex. NOTE(review): only non-emptiness of MultiFpnRois is enforced here, not that its size equals num_out_rois - confirm this is validated elsewhere. */ + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("FpnRois"), + "Input(FpnRois) shouldn't be null"); + PADDLE_ENFORCE_GE( + ctx->Outputs("MultiFpnRois").size(), 1UL, + "Outputs(MultiFpnRois) of DistributeOp should not be empty"); + size_t min_level = static_cast(ctx->Attrs().Get("min_level")); + size_t max_level = static_cast(ctx->Attrs().Get("max_level")); + PADDLE_ENFORCE_GE(max_level, min_level, + "max_level must not lower than min_level"); + // Set the output shape + size_t num_out_rois = max_level - min_level + 1; + std::vector outs_dims; + outs_dims.reserve(num_out_rois); + for (size_t i = 0; i < num_out_rois; ++i) { + framework::DDim out_dim = {-1, 4}; + outs_dims.push_back(out_dim); + } + ctx->SetOutputsDim("MultiFpnRois", outs_dims); + ctx->SetOutputDim("RestoreIndex", {1, -1}); + } +/* Kernel dtype follows FpnRois; the place is always CPUPlace. NOTE(review): a CUDA kernel is also registered for this op in distribute_fpn_proposals_op.cu - confirm whether forcing CPUPlace here is intended. */ + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; +/* Declares the op's inputs, outputs, attributes and documentation. */ +class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + 
AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") + .AsDuplicable(); + AddOutput("RestoreIndex", + "(Tensor) An array of positive number which is " + "used to restore the order of FpnRois"); + AddAttr("min_level", + "The lowest level of FPN layer where the" + " proposals come from"); + AddAttr("max_level", + "The highest level of FPN layer where the" + " proposals come from"); + AddAttr("refer_level", + "The referring level of FPN layer with" + " specified scale"); + AddAttr("refer_scale", + "The referring scale of FPN layer with" + " specified level"); + AddComment(R"DOC( +This operator distribute all proposals into different fpn level, + with respect to scale of the proposals, the referring scale and + the referring level. Besides, to restore the order of proposals, +we return an array which indicate the original index of rois in + current proposals. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp, + ops::DistributeFpnProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, + ops::DistributeFpnProposalsOpKernel, + ops::DistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9cbb969158386547485fad54120510595eb92804 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -0,0 +1,221 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "cub/cub.cuh" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +/* Threads per block, and the upper bound on blocks per launch applied by NumBlocks. */ +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +/* Each thread starts at its global index and advances by the total number of launched threads, so any n is covered even when the grid size is capped. */ +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) +/* Stride, in elements, between consecutive rois in the device-side roi buffer. */ +int const BBoxSize = 4; +/* Device functor that fills out_[i] with start_ plus i * delta_. */ +struct RangeInitFunctor { + int start_; + int delta_; + int* out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; +/* Blocks needed to cover N items at kNumCUDAThreads per block, capped at kNumMaxinumNumBlocks. */ +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} +/* Converts per-sequence lengths to start offsets: offset_lod[i] receives the sum of length_lod[0..i-1] for i in [0, lod_size). It reads length_lod[i] for every i < lod_size and writes nothing past offset_lod[lod_size - 1]. */ +static inline void TransLoD(const int* length_lod, const int lod_size, + int* offset_lod) { + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + offset_lod[i] = offset; + offset += length_lod[i]; + } +} +/* Area of a box given as four values; returns 0 when box[2] < box[0] or box[3] < box[1]. When normalized is false, 1 is added to both width and height before multiplying. */ +template +static __device__ inline T RoIArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} +/* For each of the nthreads rois: computes the non-normalized area, takes its square root as the scale, maps it to floor(log2(scale / refer_scale) plus refer_level), clamps the result to [min_level, max_level], stores it in target_lvls[i], and atomically increments the counter for (level, batch) in sub_lod_list. */ +template +static __global__ void GPUDistFpnProposalsHelper( + const int nthreads, const T* rois, const int lod_size, + const int refer_level, const int refer_scale, const int max_level, + const int min_level, int* roi_batch_id_data, int* sub_lod_list, + int* target_lvls) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + const T* offset_roi = rois + i * BBoxSize; + int roi_batch_ind = roi_batch_id_data[i]; + // get the target level of current rois + T roi_area = RoIArea(offset_roi, false); + T roi_scale = sqrt(roi_area); + int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = min(max_level, max(tgt_lvl, min_level)); + target_lvls[i] = tgt_lvl; + // compute number of rois in the same batch and same target level +/* NOTE(review): the row index is tgt_lvl, which lies in [min_level, max_level], but the caller sizes sub_lod_list as {num_level, lod_size}; when min_level > 0 this appears to index past the last row - confirm whether tgt_lvl - min_level was intended. */ platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind, + 1); + } +} +/* CUDA kernel for distribute_fpn_proposals: assigns each roi a target level on the device, sorts roi indices by level with cub radix sort, and gathers the rois of each level into the corresponding MultiFpnRois output. */ +template +class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* fpn_rois = ctx.Input("FpnRois"); + + auto multi_fpn_rois = ctx.MultiOutput("MultiFpnRois"); + auto* restore_index = ctx.Output("RestoreIndex"); + + const int min_level = ctx.Attr("min_level"); + const int max_level = ctx.Attr("max_level"); + const int refer_level = ctx.Attr("refer_level"); + const int refer_scale = ctx.Attr("refer_scale"); + int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); +/* lod_size is the number of sequences (offset count minus one) and roi_num is the last offset, i.e. the total roi count. */ + auto fpn_rois_lod = fpn_rois->lod().back(); + int lod_size = fpn_rois_lod.size() - 1; + int roi_num = fpn_rois_lod[lod_size]; + + auto& dev_ctx = ctx.template device_context(); + + // get batch id by lod in CPU + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({roi_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < lod_size; ++n) { + for (size_t i = 
fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + // copy batch id list to GPU + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(), + &roi_batch_id_list_gpu); +/* NOTE(review): no explicit zero-fill of sub_lod_list is visible before the kernel accumulates into it with CudaAtomicAdd - confirm that mutable_data returns zeroed memory. */ + Tensor sub_lod_list; + sub_lod_list.Resize({num_level, lod_size}); + int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); + Tensor target_lvls; + target_lvls.Resize({roi_num}); + int* target_lvls_data = target_lvls.mutable_data(dev_ctx.GetPlace()); + + int blocks = NumBlocks(roi_num); + int threads = kNumCUDAThreads; + + // get target levels and sub_lod list + GPUDistFpnProposalsHelper<<>>( + roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, + max_level, min_level, roi_batch_id_list_gpu.data(), + sub_lod_list_data, target_lvls_data); +/* idx_in holds 0, 1, ..., roi_num - 1 (RangeInitFunctor with start 0 and delta 1). */ + Tensor index_in_t; + int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + platform::ForRange for_range(dev_ctx, roi_num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + Tensor keys_out_t; + int* keys_out = keys_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + Tensor index_out_t; + int* idx_out = index_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num); + // Allocate temporary storage + auto place = boost::get(dev_ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes, + memory::Allocator::kScratchpad); + + // Run sorting operation + // sort target level to get corresponding index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, + idx_in, idx_out, roi_num); +/* NOTE(review): RestoreIndex is allocated as {roi_num, 1} here, whereas the CPU kernel uses {1, fpn_rois_num} and InferShape declares {1, -1} - confirm the intended layout. */ + int* restore_idx_data = + restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); + // sort current index to get restore index + 
cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, + restore_idx_data, roi_num); +/* NOTE(review): the sort just above is also descending, keyed on idx_out with values 0..roi_num - 1, so restore_idx_data[k] appears to be the sorted position of original roi (roi_num - 1 - k) rather than of roi k - confirm the ordering is intended. */ + Tensor offset_lod; + int* offset_lod_data = + offset_lod.mutable_data({lod_size + 1}, dev_ctx.GetPlace()); + for (int i = 0; i < num_level; ++i) { + Tensor sub_lod = sub_lod_list.Slice(i, i + 1); + int* sub_lod_data = sub_lod.data(); + // transfer length-based lod to offset-based lod +/* NOTE(review): TransLoD is a host function, yet sub_lod_data and offset_lod_data both come from tensors allocated on dev_ctx.GetPlace(); it is also asked to read lod_size plus 1 lengths from a row that holds lod_size entries - confirm both. */ TransLoD(sub_lod_data, lod_size + 1, offset_lod_data); + int sub_rois_num = offset_lod_data[lod_size]; +/* NOTE(review): every level slices index_out_t from row 0 - confirm whether a running start offset per level was intended. */ Tensor sub_idx = index_out_t.Slice(0, sub_rois_num); + + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + + GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + framework::LoD lod; +/* NOTE(review): offset is default-constructed (empty) when memory::Copy writes lod_size plus 1 ints through offset.data() - looks like an out-of-bounds write; confirm it should be sized first. */ std::vector offset; + memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data, + sizeof(int) * (lod_size + 1), 0); + lod.emplace_back(offset); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + distribute_fpn_proposals, + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f63e856626d64ec13476c3f967a085624a007c3a --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +/* Number of values per roi; used as the stride when walking roi data and as the second dimension of each output. */ +const int kBoxDim = 4; +/* Area of a box given as four values; returns 0 when box[2] < box[0] or box[3] < box[1]. When normalized is false, 1 is added to both width and height before multiplying. */ +template +static inline T BBoxArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} +/* CPU kernel for distribute_fpn_proposals. Pass 1 computes a target level per roi and counts rois per level; the outputs are then allocated, and a second pass copies each roi into its level's output while building the per-level LoD and the RestoreIndex permutation. */ +template +class DistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* fpn_rois = context.Input("FpnRois"); + + auto multi_fpn_rois = + context.MultiOutput("MultiFpnRois"); + + auto* restore_index = + context.Output("RestoreIndex"); + + const int min_level = context.Attr("min_level"); + const int max_level = context.Attr("max_level"); + const int refer_level = context.Attr("refer_level"); + const int refer_scale = context.Attr("refer_scale"); + const int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); +/* fpn_rois_num is the last LoD offset, i.e. the total number of rois. NOTE(review): fpn_rois_lod.size() - 1 is unsigned arithmetic, so an empty offset vector would wrap around - confirm the LoD is never empty here. */ + auto fpn_rois_lod = fpn_rois->lod().back(); + int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector target_level; + // std::vector target_level(fpn_rois_num, -1); + // record the number of rois in each level + std::vector num_rois_level(num_level, 0); + std::vector num_rois_level_integral(num_level + 1, 0); +/* Pass 1: level = floor(log2(sqrt(area) / refer_scale) plus refer_level), clamped to [min_level, max_level]. */ for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + // get the target level of current rois + T roi_scale = std::sqrt(BBoxArea(rois_data, false)); + int tgt_lvl = + std::floor(std::log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); + target_level.push_back(tgt_lvl); + num_rois_level[tgt_lvl - min_level]++; + rois_data += kBoxDim; + } + } + // define the output rois + // pointer which point to each level fpn rois + std::vector multi_fpn_rois_data(num_level); + // lod0 which will record the offset information of each level rois + std::vector> multi_fpn_rois_lod0; + for (int i = 0; i < num_level; ++i) { + // allocate memory for each 
level rois + multi_fpn_rois[i]->mutable_data({num_rois_level[i], kBoxDim}, + context.GetPlace()); + multi_fpn_rois_data[i] = multi_fpn_rois[i]->data(); + std::vector lod0(1, 0); + multi_fpn_rois_lod0.push_back(lod0); + // statistic start point for each level rois + num_rois_level_integral[i + 1] = + num_rois_level_integral[i] + num_rois_level[i]; + } + restore_index->mutable_data({1, fpn_rois_num}, context.GetPlace()); + int* restore_index_data = restore_index->data(); + std::vector restore_index_inter(fpn_rois_num, -1); + // distribute the rois into different fpn level by target level + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + size_t cur_offset = fpn_rois_lod[i]; + // std::vector lod_offset[num_level]; +/* Seed batch i's end offset for every level with the previous batch's end offset; it is then incremented once per roi placed below. */ for (int j = 0; j < num_level; j++) { + multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]); + } + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + int lvl = target_level[cur_offset + j]; + memcpy(multi_fpn_rois_data[lvl - min_level], rois_data, + kBoxDim * sizeof(T)); + multi_fpn_rois_data[lvl - min_level] += kBoxDim; +/* Position of this roi in the level-major concatenation of all outputs: the level's start plus the number of rois already placed in that level. */ int index_in_shuffle = num_rois_level_integral[lvl - min_level] + + multi_fpn_rois_lod0[lvl - min_level][i + 1]; + restore_index_inter[index_in_shuffle] = cur_offset + j; + multi_fpn_rois_lod0[lvl - min_level][i + 1]++; + rois_data += kBoxDim; + } + } +/* Invert the mapping so that restore_index_data[original index] is the roi's position in the level-major order. */ for (int i = 0; i < fpn_rois_num; ++i) { + restore_index_data[restore_index_inter[i]] = i; + } + // merge lod information into LoDTensor + for (int i = 0; i < num_level; ++i) { + framework::LoD lod; + lod.emplace_back(multi_fpn_rois_lod0[i]); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 
2b0c1f560f23eee7fbdf14444bf933535b704167..f13c02038606e52337b7ef85545e37054e54b631 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor { auto *output = output_t->mutable_data(context.GetPlace()); PADDLE_ENFORCE_LE(table_width * idx_width, out_width); - PADDLE_ENFORCE_GT(ids_lod.size(), 1UL); + PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty"); jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width, out_width, jit::SeqPoolType::kSum); @@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims()); const auto &ids_lod = ids_t->lod(); // in run time, the LoD of ids must be 1 - PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1"); - PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); +/* NOTE(review): this is PADDLE_ENFORCE rather than PADDLE_ENFORCE_EQ, so ids_lod.size() is presumably tested only for being non-zero and 1UL is not compared against - confirm. This change also drops the emptiness check on ids_lod[0] that preceded the ids_lod[0].size() - 1 below. */ PADDLE_ENFORCE(ids_lod.size(), 1UL, + "The LoD level of Input(Ids) must be 1"); int64_t batch_size = ids_lod[0].size() - 1; // in run time, the shape from Ids -> output - // should be [seq_length, 1] -> [batch_size, embedding_size] + // should be [seq_length, 1] -> [batch_size, last_dim] output_t->Resize({batch_size, last_dim}); if (combiner_type == "sum") { @@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); auto lod = ids->lod()[0]; - int64_t row_width = d_output->dims()[1]; + int64_t out_width = d_output->dims()[1]; framework::Vector *new_rows = d_table->mutable_rows(); new_rows->resize(ids_num); @@ -136,15 +135,13 
@@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto blas = math::GetBlas(context); + auto vbroadcast = jit::Get, + platform::CPUPlace>(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); - int64_t in_offset = lod[i] * row_width; - const T *out_pos = d_output_data + i * row_width; - T *in_pos = d_table_data + in_offset; - for (int r = 0; r != h; ++r) { - blas.VCOPY(row_width, out_pos, in_pos + r * row_width); - } + const T *src = d_output_data + i * out_width; + T *dst = d_table_data + lod[i] * out_width; +/* Replaces the removed loop that copied the gradient row at out_pos into each of the h rows starting at in_pos; vbroadcast presumably performs the same h-fold row copy. */ vbroadcast(src, dst, h, out_width); } } else { LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 11dc615f5ff8ea78bbbf6eeb655ee88b3a52dc13..3088280bb90174e6195a349c07a3435e131e2b33 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() { } } +/* Benchmarks every implementation of the kernel type for each width w in {1, 16, 64, 100, 256} and each height h from TestSizes(): x holds w random values and y is sized h * w. */template +void BenchVBroadcastKernel() { + for (int64_t w : {1, 16, 64, 100, 256}) { + Tensor x; + x.Resize({w}); + RandomVec(w, x.mutable_data(PlaceType())); + const T* x_data = x.data(); + for (int h : TestSizes()) { + Tensor y; + y.Resize({h * w}); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls, PlaceType>( + w, x_data, y_data, static_cast(h), w); + } + } +} + using T = float; using CPUPlace = paddle::platform::CPUPlace; @@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVCopy) { BenchXYNKernel(); } // lstm and peephole BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } @@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) { 
BenchCRFDecodingKernel(); } +// vbroadcast function +BENCH_FP32_CPU(kVBroadcast) { + BenchVBroadcastKernel(); +} + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index eb0c03568ddddf1c456fec6fcc81f3b40d051844..99244ea9bd919a018732b75d1ab811e8bf338516 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHSum) USE_JITKERNEL_GEN(kEmbSeqPool) USE_JITKERNEL_GEN(kSgd) +USE_JITKERNEL_GEN(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f9fbdbd821acae0940c5a7b8d9a5eb2432712ff --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/vbroadcast.h" +#include +#include +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void VBroadcastJitCode::genCode() { + preCode(); + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 16; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + const size_t block_size = sizeof(float) * block; + std::vector groups(num_groups, max_num_regs); + int rest_num_regs = num_block % max_num_regs; + if (rest_num_regs > 0) { + groups.push_back(rest_num_regs); + } + + // protect param_h + mov(reg_height, param_h); + Label l_next_h; + xor_(reg_h_i, reg_h_i); + mov(reg_ptr_dst_i, param_dst); + L(l_next_h); + { + mov(reg_ptr_src_i, param_src); + for (int num_regs : groups) { + size_t w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); + w_offset += block_size; + } + add(reg_ptr_src_i, num_regs * block_size); + + w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); + w_offset += block_size; + } + add(reg_ptr_dst_i, num_regs * block_size); + } // end of groups + inc(reg_h_i); + cmp(reg_h_i, reg_height); + jl(l_next_h, T_NEAR); + } // end of l_next_h + + postCode(); +} + +class VBroadcastCreator : public JitCodeCreator { + public: + bool UseMe(const int64_t& w) const override { + return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; + } + size_t CodeSize(const int64_t& w) const override { + return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; + } + std::unique_ptr CreateJitCode(const int64_t& w) const override { + PADDLE_ENFORCE_GT(w, 0); + return make_unique(w, CodeSize(w)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; 
+ +REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..27c75f6f710e9514c7d91181e7f447d9dd997081 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class VBroadcastJitCode : public JitCode { + public: + explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), w_(w) { + this->genCode(); + } + + DECLARE_JIT_CODE(VBroadcastJitCode); + void genCode() override; + + private: + int w_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_h{abi_param3}; + reg64_t param_w{abi_param4}; + + reg64_t reg_height{r9}; + reg64_t reg_h_i{r10}; + reg64_t reg_ptr_src_i{r11}; + reg64_t reg_ptr_dst_i{r12}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 
1dc60442d5c5f6acf49b6319223b190f6c81e1a6..eb1c410b6f9a31c3f97a274c5e5ff55bf1c32ea0 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -36,6 +36,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kVScal); ONE_CASE(kVAddBias); ONE_CASE(kVRelu); + ONE_CASE(kVBroadcast); + ONE_CASE(kVCopy); ONE_CASE(kVIdentity); ONE_CASE(kVExp); ONE_CASE(kVSquare); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 895e2d4d6f3809a66443ed6d6bfc1ee02d6c529a..96e162a21bff2a5624f35ada615c9a9a17ad3c75 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -41,6 +41,8 @@ typedef enum { kVAdd, kVAddBias, kVAddRelu, + kVBroadcast, + kVCopy, kVExp, kVIdentity, kVMul, @@ -133,6 +135,13 @@ struct GRUTuples { typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +template +struct VBroadcastTuples { + typedef T data_type; + typedef int64_t attr_type; + typedef void (*func_type)(const T*, T*, int64_t, int64_t); +}; + typedef struct seq_pool_attr_s { int h, w; // h should always be the first one SeqPoolType type; diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 740d0f850a072a5ad3238e52402141a83c0b7e33..1c2fddcae79d8b89e1169d5bcb364b3ff2e42dd3 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -24,6 +24,11 @@ size_t JitCodeKey(const int& d) { return d; } +template <> +size_t JitCodeKey(const int64_t& d) { + return d; +} + // TODO(TJ): refine and benchmark JitCodeKey generatation constexpr int act_type_shift = 3; // suppot 2^3 act types static inline int act_type_convert(KernelType type) { diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 9a00ad56a6a909a677cb8f60bd80fe399e82952f..f69417c370b653d93cce04a2248ad809168670da 100644 --- 
a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl) USE_JITKERNEL_MORE(kVScal, mkl) USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVSquare, mkl) +USE_JITKERNEL_MORE(kVCopy, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl) USE_JITKERNEL_MORE(kSoftmax, mkl) USE_JITKERNEL_MORE(kEmbSeqPool, mkl) USE_JITKERNEL_MORE(kSgd, mkl) +USE_JITKERNEL_MORE(kVBroadcast, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 780fda02c1ff3da2e0b945f9b2fece30484e4519..4f51353bce834325e6c659399a374e4fbc40d4b7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -154,6 +154,21 @@ bool VSquareKernel::UseMe(const int& d) const { return d > 7; } +template <> +bool VCopyKernel::UseMe(const int& d) const { + return d > 15; +} + +template <> +bool VBroadcastKernel::UseMe(const int64_t& d) const { + return d > 127; +} + +template <> +bool VBroadcastKernel::UseMe(const int64_t& attr) const { + return true; +} + template <> bool VSigmoidKernel::UseMe(const int& d) const { return d > 7; @@ -223,6 +238,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VSquare); +AWALYS_USE_ME_WITH_DOUBLE(VCopy); AWALYS_USE_ME_WITH_DOUBLE(Softmax); #undef AWALYS_USE_ME_WITH_DOUBLE @@ -244,6 +260,8 @@ REGISTER_MKL_KERNEL(kVAdd, VAdd); REGISTER_MKL_KERNEL(kVScal, VScal); REGISTER_MKL_KERNEL(kVExp, VExp); REGISTER_MKL_KERNEL(kVSquare, VSquare); +REGISTER_MKL_KERNEL(kVCopy, VCopy); +REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kSeqPool, SeqPool); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 
a7bc2de4a3e8e7d8e2a6b00990bfa459b3029c2a..db2d6faed4fdcfebedb9d9eb752831259af30186 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n); template void VAXPY(T a, const T* x, T* y, int n); +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + template void VSigmoid(const T* x, T* y, int n) { const T min = SIGMOID_THRESHOLD_MIN; @@ -192,6 +199,7 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples); DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); DECLARE_MKL_KERNEL(VTanh, XYNTuples); DECLARE_MKL_KERNEL(VSquare, XYNTuples); +DECLARE_MKL_KERNEL(VCopy, XYNTuples); DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); @@ -201,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); DECLARE_MKL_KERNEL(Sgd, SgdTuples); +DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index cd19dd169d0bfdfe2cb8157ade29f48ad6428453..ffab9c1457b932b3211e6aa75954bb1435f8e34c 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu) USE_JITKERNEL_REFER(kVSub) USE_JITKERNEL_REFER(kVScal) USE_JITKERNEL_REFER(kVAddBias) +USE_JITKERNEL_REFER(kVCopy) USE_JITKERNEL_REFER(kVRelu) USE_JITKERNEL_REFER(kVIdentity) USE_JITKERNEL_REFER(kVExp) @@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax) USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kSgd) +USE_JITKERNEL_REFER(kVBroadcast) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 0c434bd2b8cacdf4b8872da66bb8e763a6a45cee..c279d1b2ca4f53bb6bc5da0cab41e9086ed475bd 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ 
b/paddle/fluid/operators/jit/refer/refer.cc @@ -30,6 +30,7 @@ REGISTER_REFER_KERNEL(kVScal, VScal); REGISTER_REFER_KERNEL(kVAddBias, VAddBias); REGISTER_REFER_KERNEL(kVRelu, VRelu); +REGISTER_REFER_KERNEL(kVCopy, VCopy); REGISTER_REFER_KERNEL(kVIdentity, VIdentity); REGISTER_REFER_KERNEL(kVSquare, VSquare); REGISTER_REFER_KERNEL(kVExp, VExp); @@ -61,4 +62,6 @@ REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool); REGISTER_REFER_KERNEL(kSgd, Sgd); +REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 0f714edf85bbbf4838bfe09251bd1c2d5f3b3eb7..b3b2097828c5b6d647fd6bfe14a6e8bff04409e0 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -70,6 +70,20 @@ void VAddBias(const T* a, const T* x, T* y, int n) { } } +template +void VCopy(const T* x, T* y, int n) { + std::memcpy(y, x, n * sizeof(T)); +} + +// x shape: (x_len) +// y shape: (h, x_len) +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + template void VRelu(const T* x, T* y, int n) { for (int i = 0; i < n; ++i) { @@ -500,6 +514,7 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples); DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); DECLARE_REFER_KERNEL(VTanh, XYNTuples); DECLARE_REFER_KERNEL(VSquare, XYNTuples); +DECLARE_REFER_KERNEL(VCopy, XYNTuples); // lstm_t*, const lstm_attr_t* DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); @@ -528,6 +543,8 @@ DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples); DECLARE_REFER_KERNEL(Sgd, SgdTuples); +DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index b618cd6a84be752a052f9d49a4a4c772b1d7eeae..cdec14dc4383897f4ae24fc89b99fe00c713cf42 100644 --- a/paddle/fluid/operators/jit/test.cc +++ 
b/paddle/fluid/operators/jit/test.cc @@ -26,8 +26,8 @@ limitations under the License. */ DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template -void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), - const T upper = static_cast(20.f)) { +void RandomVec(const int n, T* a, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); @@ -157,6 +157,26 @@ struct TestFuncWithRefer, std::vector, T> { } }; +template +struct TestFuncWithRefer, std::vector, + std::vector, int64_t, + typename jit::VBroadcastTuples::attr_type> { + void operator()(const typename jit::VBroadcastTuples::func_type tgt, + const std::vector& x, const std::vector& yref, + int64_t h, + const typename jit::VBroadcastTuples::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(attr)); + EXPECT_EQ(yref.size(), x.size() * h); + std::vector y(yref.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, h, attr); + ExpectEQ(y_data, yref_data, yref.size()); + } +}; + template struct TestFuncWithRefer, std::vector, std::vector> { void operator()(const typename jit::XYNTuples::func_type tgt, @@ -514,7 +534,7 @@ void TestKernelXRNTuples() { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(d); - RandomVec(d, x.data(), -2.f, 2.f); + RandomVec(d, x.data()); T ref_res; ref(x.data(), &ref_res, d); TestAllImpls, PlaceType, std::vector, T>(d, x, @@ -532,7 +552,7 @@ void TestKernelXYNTuples() { std::vector x(d), yref(d); std::vector xinp(d); // inplace test - RandomVec(d, x.data(), -2.f, 2.f); + RandomVec(d, x.data()); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); @@ -566,7 +586,7 @@ void TestKernelLSTMTuples() { EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * 
d); - RandomVec(4 * d, xsrc.data(), -2.f, 2.f); + RandomVec(4 * d, xsrc.data()); RandomVec(3 * d, wp.data(), -1.f, 1.f); RandomVec(d, ct_1.data(), -1.f, 1.f); // x could be changed after compute, so copy to save src @@ -614,8 +634,8 @@ void TestKernelGRUTuples() { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); - RandomVec(3 * d, xsrc.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); + RandomVec(3 * d, xsrc.data()); + RandomVec(d, ht_1.data()); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); @@ -651,7 +671,7 @@ void TestKernelSeqPoolTuples() { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); - RandomVec(h * w, x.data(), -2.f, 2.f); + RandomVec(h * w, x.data()); const T* x_data = x.data(); T* yref_data = yref.data(); ref(x_data, yref_data, &attr); @@ -676,8 +696,8 @@ void TestKernelMatMulTuples() { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); + RandomVec(m * k, a.data()); + RandomVec(k * n, b.data()); const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); @@ -699,7 +719,7 @@ void TestKernelSoftmaxTuples() { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); - RandomVec(bs * n, x.data(), -2.f, 2.f); + RandomVec(bs * n, x.data()); const T* x_data = x.data(); T* y_data = y.data(); @@ -726,7 +746,7 @@ void TestKernelEmbSeqPoolTuples() { test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (int tbl_w : test_sizes) { std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data(), -2.f, 2.f); + RandomVec(tbl_h * tbl_w, table.data()); const T* table_data = table.data(); for (auto type : pool_types) { for (int idx_w : {1, 2, 10, 16}) { @@ -772,14 +792,14 
@@ void TestKernelSgdTuples() { for (int grad_w : TestSizes()) { std::vector param(param_h * grad_w); std::vector param_out(param_h * grad_w); - RandomVec(param_h * grad_w, param.data(), -2.f, 2.f); + RandomVec(param_h * grad_w, param.data()); const T* param_data = param.data(); T* out_data = param_out.data(); for (int rows_size = 1; rows_size <= param_h; ++rows_size) { std::vector grad(rows_size * grad_w); std::vector rows = UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); - RandomVec(rows_size * grad_w, grad.data(), -2.f, 2.f); + RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); auto ref = jit::GetRefer>(); @@ -815,8 +835,8 @@ void TestKernelNCHW16CMulNCTuples() { int sz = n * c * h * w; std::vector x(sz), y(n * c), zref(sz); std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data(), -2.f, 2.f); - RandomVec(n * c, y.data(), -2.f, 2.f); + RandomVec(sz, x.data()); + RandomVec(n * c, y.data()); const T* x_data = x.data(); const T* y_data = y.data(); @@ -873,11 +893,11 @@ void TestKernelLayerNormTuples() { int sz = left * right; std::vector x(sz), mean(left), var(left), scale(right), bias(right), outref(sz); - RandomVec(sz, x.data(), -2.f, 2.f); - RandomVec(left, mean.data(), -2.f, 2.f); - RandomVec(left, var.data(), -2.f, 2.f); - RandomVec(right, scale.data(), -2.f, 2.f); - RandomVec(right, bias.data(), -2.f, 2.f); + RandomVec(sz, x.data()); + RandomVec(left, mean.data()); + RandomVec(left, var.data()); + RandomVec(right, scale.data()); + RandomVec(right, bias.data()); const T* scale_data = scale.data(); const T* bias_data = bias.data(); @@ -903,7 +923,7 @@ void TestKernelCRFDecodingTuples() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); constexpr int state_trans_base_idx = 2; auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); for (int seq_len : {1, 
11, 17, 50}) { for (int tag_num : test_sizes) { auto ref = jit::GetRefer>(); @@ -912,8 +932,8 @@ void TestKernelCRFDecodingTuples() { int w_sz = (tag_num + state_trans_base_idx) * tag_num; std::vector x(x_sz), w(w_sz), alpharef(x_sz); std::vector trackref(x_sz); - RandomVec(x_sz, x.data(), -2.f, 2.f); - RandomVec(w_sz, w.data(), -2.f, 2.f); + RandomVec(x_sz, x.data()); + RandomVec(w_sz, w.data()); ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), trackref.data(), tag_num); @@ -926,6 +946,27 @@ void TestKernelCRFDecodingTuples() { } } +template +void TestKernelVBroadcastTuples() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int w : TestSizes()) { + std::vector x(w); + RandomVec(w, x.data()); + const T* x_data = x.data(); + for (int64_t h : {1, 2, 6}) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector y(w * h); + T* y_data = y.data(); + ref(x_data, y_data, h, w); + + TestAllImpls, PlaceType, std::vector, + std::vector, int64_t>(static_cast(w), x, y, h, + static_cast(w)); + } + } +} + #define TEST_CPU_KERNEL(test_tuple, kernel_type) \ TEST(JITKernel, kernel_type) { \ TestKernel##test_tuple(); \ @@ -949,6 +990,7 @@ TEST_CPU_KERNEL(XYNTuples, kVSquare); TEST_CPU_KERNEL(XYNTuples, kVExp); TEST_CPU_KERNEL(XYNTuples, kVSigmoid); TEST_CPU_KERNEL(XYNTuples, kVTanh); +TEST_CPU_KERNEL(XYNTuples, kVCopy); TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt); TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1); @@ -966,6 +1008,7 @@ TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool); TEST_CPU_KERNEL(SgdTuples, kSgd); TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm); TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding); +TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast); TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..44e8281424ba6937dad2c2dee1db4dee96b3b2eb --- /dev/null +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/requantize_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using platform::to_void_cast; +using Tensor = framework::Tensor; +using framework::DataLayout; +using mkldnn::stream; +using platform::GetMKLDNNFormat; + +template +class ReQuantOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto scale_in = ctx.Attr("Scale_in"); + auto scale_out = ctx.Attr("Scale_out"); + auto* output = ctx.Output("Output"); + auto& dev_ctx = + ctx.template device_context(); + const auto& engine = dev_ctx.GetEngine(); + + std::vector pipeline; + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + mkldnn::memory::data_type dst_dt = src_dt; // 
TODO(Xiaoli) support + // requantize from different + // data type (e.g., s8 to u8) + mkldnn::memory::format src_fmt = memory::format::nhwc; + mkldnn::memory::format dst_fmt = memory::format::nhwc; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + float scale_shift = scale_out / scale_in; + + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, {scale_shift}); + + auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); + auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); + auto src_memory = + std::make_shared(src_pd, to_void_cast(input_data)); + std::shared_ptr src_memory_p = + std::shared_ptr(new primitive::at(*src_memory)); + + auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt); + auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); + auto dst_memory = mkldnn::memory(dst_pd, to_void_cast(output_data)); + + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(src_pd, dst_pd, attri)); + + auto reorder_p = std::shared_ptr( + new reorder(*reorder_pd, *src_memory_p, dst_memory)); + pipeline.push_back(*reorder_p); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace, + ops::ReQuantOpKernel, ops::ReQuantOpKernel); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index a1e02a3fd0e7902e89890f8d3b13159172571f5c..88c968a0eaae8a2ac6f14ede9348c837bcd92d76 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -157,11 +157,13 @@ class RecurrentBase : public framework::OperatorBase { const std::vector &src_vars, framework::Scope *dst_scope, const std::vector &dst_vars, - Callback callback) { + 
Callback callback, + bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; - AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback, + is_backward); } } @@ -173,11 +175,13 @@ class RecurrentBase : public framework::OperatorBase { const std::vector &src_vars, const framework::Scope &dst_scope, const std::vector &dst_vars, - Callback callback) { + Callback callback, + bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; - AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback, + is_backward); } } @@ -194,9 +198,13 @@ class RecurrentBase : public framework::OperatorBase { static void AccessTensor(const framework::Scope &src_scope, const std::string &src_var_name, framework::Scope *dst_scope, - const std::string &dst_var_name, Callback callback) { + const std::string &dst_var_name, Callback callback, + bool is_backward = false) { auto *src_var = src_scope.FindVar(src_var_name); - PADDLE_ENFORCE(src_var != nullptr); + if (is_backward && src_var == nullptr) { + return; + } + PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); auto *dst_var = dst_scope->Var(dst_var_name); @@ -208,12 +216,16 @@ class RecurrentBase : public framework::OperatorBase { static void AccessTensor(const framework::Scope &src_scope, const std::string &src_var_name, const framework::Scope &dst_scope, - const std::string &dst_var_name, Callback callback) { + const std::string &dst_var_name, Callback callback, + bool is_backward = false) { + auto *dst_var = dst_scope.FindVar(dst_var_name); + if (is_backward 
&& dst_var == nullptr) { + return; + } auto *src_var = src_scope.FindVar(src_var_name); - PADDLE_ENFORCE(src_var != nullptr); + PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); - auto *dst_var = dst_scope.FindVar(dst_var_name); - PADDLE_ENFORCE(dst_var != nullptr); + PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name); auto *dst_tensor = dst_var->GetMutable(); callback(src_tensor, dst_tensor); } @@ -345,7 +357,8 @@ class RecurrentGradOp : public RecurrentBase { auto dims = framework::vectorize(inside->dims()); dims.erase(dims.begin()); inside->Resize(framework::make_ddim(dims)); - }); + }, + true /*is_backward*/); auto og_set = List2Set(Inputs(kOutputGrads)); if (VLOG_IS_ON(10)) { @@ -454,7 +467,8 @@ class RecurrentGradOp : public RecurrentBase { auto dst = outside->Slice(seq_offset, seq_offset + 1); framework::TensorCopy(inside, place, dev_ctx, &dst); - }); + }, + true /*is_backward*/); VLOG(5) << "Link outside gradient finished "; if (step_id + 1 == seq_len) { // at_end @@ -467,7 +481,8 @@ class RecurrentGradOp : public RecurrentBase { outside->Resize(inside.dims()); outside->mutable_data(place, inside.type()); framework::TensorCopy(inside, place, dev_ctx, outside); - }); + }, + true /*is_backward*/); VLOG(5) << "Link initialize state gradient finished "; } scopes.Next(); @@ -608,10 +623,8 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { std::vector input{kInputs, kInitialStates}; std::vector output{kOutputs}; for (auto &s : input) { + // NOTE(zcd): In some case, some of kInputs doesn't have gradient. 
PADDLE_ENFORCE(ctx->HasInputs(s)); - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), - "Cannot find the gradient variable %s", - framework::GradVarName(s)); } for (auto &s : output) { PADDLE_ENFORCE(ctx->HasInputs(s)); diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..08ba1470aaddf146fe3685ff6c3cd9f3d7e16d75 --- /dev/null +++ b/paddle/fluid/operators/requantize_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/requantize_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +framework::OpKernelType ReQuantOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_ = framework::LibraryType::kMKLDNN; + framework::DataLayout layout_ = framework::DataLayout::kMKLDNN; + + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); +} + +void ReQuantOpMaker::Make() { + AddInput("Input", "input data"); + AddOutput("Output", "output data"); + AddAttr("Scale_in", "scale in data").SetDefault({1.0f}); + AddAttr("Scale_out", "scale out data").SetDefault({1.0f}); + AddComment( + R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC"); +} + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker, + paddle::framework::DefaultGradOpDescMaker); diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c2b154db11dc713fdce1b9ef2f2616428bc09202 --- /dev/null +++ b/paddle/fluid/operators/requantize_op.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class ReQuantOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Output", ctx->GetInputDim("Input")); + ctx->ShareLoD("Input", /*->*/ "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index eda54f76b898cdf893347d31cadb86dea892a4ce..37f69426b62fedf8cbeca68105fb86fb4ea72eab 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -56,6 +56,9 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { const int64_t in_size = framework::product(in_dims); + auto in_dims_vec = framework::vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); // only one dimension can be set to -1, whose size will be automatically // infered. 
const int64_t unk_dim_val = -1; @@ -88,7 +91,7 @@ class ReshapeOp : public framework::OperatorWithKernel { } if (unk_dim_idx != -1) { - if (in_size > 0) { + if (all_positive) { // in_size < 0 and is un-determinate in compile time, skip the check, // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], // capacity = -24, in_size = -8, output_shape[0] = 0 diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 031335009b692f9d1f73070c88e8e79d852cbe36..a8c86de9f9a1aea9ecdedd750757ec7d25cdf2f3 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_serialized_data", + "the serialized data contains the all info of the ICUDAEngine"); AddAttr( "engine_key", "The engine_key here is used to distinguish different TRT Engines"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b..c36673312489738ad0475a0b70a23a1c6c948b9d 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -16,8 +16,10 @@ #ifdef PADDLE_WITH_CUDA +#include #include #include +#include #include #include "paddle/fluid/framework/executor.h" @@ -31,37 +33,6 @@ namespace paddle { namespace operators { -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return 
TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); - if (shape.size() == 4UL) - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); - return nvinfer1::DimsCHW(shape[1], 1, 1); -} - -} // namespace // NOLINT - using inference::Singleton; using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TRTInt8Calibrator; @@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase { bool enable_int8_; std::string calibration_data_; std::string engine_key_; + std::string engine_serialized_data_; bool calibration_mode_; public: @@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase { enable_int8_ = Attr("enable_int8"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); + engine_serialized_data_ = Attr("engine_serialized_data"); auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase { RunCalibration(scope, dev_place); return; } - RunTrt(scope, dev_place); + auto *trt_engine = GetEngine(scope, dev_place); + RunTrt(scope, dev_place, trt_engine); } void RunCalibration(const framework::Scope &scope, @@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase { LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ << " is running calibration trt int8... 
"; int runtime_batch = 1; - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); if (!Singleton::Global().Has(engine_key_)) { TRTCalibratorEngine *calib_res = Singleton::Global().Create(engine_key_); @@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, enable_int8_, - calib_res->calib_.get())); + max_batch_size_, workspace_size_, enable_int8_, + calib_res->calib_.get(), + boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; - Prepare(scope, dev_place, calib_res->engine_.get()); + PrepareTRTEngine(scope, calib_res->engine_.get()); })); } @@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase { RunNativeImpl(scope, dev_place); } - void RunTrt(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { int runtime_batch = 1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); auto stream = reinterpret_cast(dev_ctx).stream(); - if (trt_engine_.get() == nullptr) { - trt_engine_.reset( - new TensorRTEngine(max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, - enable_int8_, calibrator_.get())); - Prepare(scope, dev_place, trt_engine_.get()); - } - auto *engine = trt_engine_.get(); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = Attr>("output_name_mapping"); - // Convert input tensor from fluid to engine. 
+ int num_inputs = 0; + + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + } + const int num_bindings = num_inputs + Outputs("Ys").size(); + std::vector buffers(num_bindings); + + // Bind input tensor to TRT. for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer @@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::analysis::GetFromScope(scope, x); auto t_shape = framework::vectorize(t.dims()); runtime_batch = t_shape[0]; - if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); - } else { - engine->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); - } - } - cudaStreamSynchronize(stream); - PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); - // Execute the engine. - engine->Execute(runtime_batch); + const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(t.data()); + } - // Convert output tensor from engine to fluid + // Bind output tensor to TRT. int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { - VLOG(4) << y; - // convert output and copy to fluid. - nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); - auto dims = trt_t->getDimensions(); + const int bind_index = + engine->engine()->getBindingIndex(output_maps[output_index].c_str()); + auto dims = engine->engine()->getBindingDimensions(bind_index); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. 
std::vector ddim; @@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase { for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) change this float to dtype size. - auto size = - inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; - engine->GetOutputInGPU( - output_maps[output_index], - fluid_t->mutable_data(platform::CUDAPlace( - boost::get(dev_place).device)), - size * sizeof(float)); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(fluid_t->mutable_data( + boost::get(dev_place))); + output_index += 1; } + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); + // Execute the engine. + engine->Execute(runtime_batch, &buffers, stream); cudaStreamSynchronize(stream); } - void Prepare(const framework::Scope &scope, const platform::Place &dev_place, - TensorRTEngine *engine) const { + TensorRTEngine *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (!trt_engine_) { + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + boost::get(dev_place).device)); + if (!engine_serialized_data_.empty()) { + trt_engine_->Deserialize(engine_serialized_data_); + } else { + PrepareTRTEngine(scope, trt_engine_.get()); + } + } + return trt_engine_.get(); + } + + void PrepareTRTEngine(const framework::Scope &scope, + TensorRTEngine *engine) const { LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). 
This process may cost a lot of time."; - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(Attr("subgraph")); + framework::proto::BlockDesc block_proto; + block_proto.ParseFromString(Attr("subgraph")); + framework::BlockDesc block_desc(nullptr, &block_proto); - std::vector output_maps = + std::vector inputs = Inputs("Xs"); + std::vector outputs = Attr>("output_name_mapping"); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : Inputs("Xs")) { - if (param_names_.count(input)) continue; - VLOG(4) << "declare input " << input; - - auto &t = - inference::analysis::GetFromScope(scope, input); - auto t_shape = framework::vectorize(t.dims()); - - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(t_shape)); - } inference::Singleton::Global() - .ConvertBlock(block_desc, param_names_, scope, engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - engine->FreezeNetwork(); + .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, + outputs, engine); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751..e7ad2f4fe0c654d8928f5793c1ad8052ab766fb5 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, 
manual) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 400a6d7bfa5912774c4bbb2a5868dd9a471afd00..9553298d5e6b315d510b6cdcc8ab30dd33a5b2c9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -38,6 +38,22 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_double( + initial_gpu_memory_in_mb, -1.0, + "GPU memory chunk size in MB." + "Allocator would allocate FLAGS_initial_gpu_memory_in_mb size " + "chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size " + "chunk when the first chunk is not enough. This flag has higher priority " + "than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0."); + +DEFINE_double(reallocate_gpu_memory_in_mb, -1.0, + "GPU memory chunk size in MB." + "If FLAGS_initial_gpu_memory_in_mb is set and " + "FLAGS_reallocate_gpu_memory_in_mb " + "is less than 0, it would be replaced by " + "FLAGS_initial_gpu_memory_in_mb. 
Disable " + "when FLAGS_initial_gpu_memory_in_mb is less than 0."); + DEFINE_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " @@ -211,13 +227,54 @@ size_t GpuMaxChunkSize() { size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * (total - reserving)); - PADDLE_ENFORCE_LE(allocating, available, "Insufficient GPU memory to allocation."); return allocating; } +size_t GpuFirstAllocateChunkSize() { + if (FLAGS_initial_gpu_memory_in_mb <= 0) { + return GpuMaxChunkSize(); + } + + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(&available, &total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + + size_t initial_mem = + static_cast(FLAGS_initial_gpu_memory_in_mb * (1 << 20)); + PADDLE_ENFORCE_LE(initial_mem, available, + "Insufficient GPU memory to allocation."); + return initial_mem; +} + +size_t GpuReAllocateChunkSize() { + if (FLAGS_initial_gpu_memory_in_mb <= 0) { + return GpuMaxChunkSize(); + } + + double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb; + if (reallocate_mem < 0) { + PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0, + "FLAGS_init_gpu_memory_to_use_mb must be larger than 0"); + reallocate_mem = FLAGS_initial_gpu_memory_in_mb; + } + + size_t total = 0; + size_t available = 0; + GpuMemoryUsage(&available, &total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t realloc_mem = static_cast(reallocate_mem * (1 << 20)); + PADDLE_ENFORCE_LE(realloc_mem, available, + "Insufficient GPU memory to allocation."); + return realloc_mem; +} + void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 
1e1ab2503f53fe20bbe62c48f65d8535947f1aa8..7c05658851d0ea1118d706ed3810809e68593df4 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -66,6 +66,12 @@ size_t GpuMinChunkSize(); //! Get the maximum chunk size for GPU buddy allocator. size_t GpuMaxChunkSize(); +//! Get init chunk size for GPU buddy allocator. +size_t GpuFirstAllocateChunkSize(); + +//! Get reallocate chunk size for GPU buddy allocator. +size_t GpuReAllocateChunkSize(); + //! Copy memory from address src to dst asynchronously. void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 6cb4ec1da5e324aece08234ea56704bb5df001ff..4e1056cfb9e3d8c50139db50b67491ce3b839fd3 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -77,6 +77,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { } VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); + alloc::AllocationDeleter()(temp_allocation); } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 7db2bb451b49918fd8d92a6036c132d34e965c63..03c1b0bd092181e4f20bf8944823c688ff98d65f 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, - py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("use_static") = true) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) 
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2fa6b79caf129862e501842b0f7b86926423d3a1..f29ccb94ee4d03acb1da394b2f309a38f0530e79 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -159,6 +159,7 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ + 'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/imperative/layer_object_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..6afffe3636dd79d124a5b0e9d9eccb02630f5b8c --- /dev/null +++ b/python/paddle/fluid/imperative/layer_object_helper.py @@ -0,0 +1,220 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import copy +import six +from ..framework import Parameter, _in_imperative_mode +from ..param_attr import ParamAttr +from .. 
import core +from six.moves import zip +from ..layer_helper_base import LayerHelperBase + + +class LayerObjectHelper(LayerHelperBase): + def __init__(self, name): + super(LayerObjectHelper, self).__init__(name, layer_type=name) + + def append_op(self, + type=None, + inputs=None, + outputs=None, + attrs=None, + stop_gradient=None): + """append an operator for this layer object. + + Args: + type: operator type + inputs: input variable of the operator + dtype: data type of this parameter + is_bias: if this is a bias parameter + default_initializer: set the default initializer for this parameter + + Returns created parameter Variable. + """ + return self.main_program.current_block().append_op( + type=type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=stop_gradient) + + def _multiple_input(self, inputs_in): + inputs = inputs_in + ret = [] + if isinstance(inputs, (list, tuple)): + for inp in inputs: + ret.append(self.to_variable(inp)) + else: + ret.append(self.to_variable(inputs)) + return ret + + # TODO: make it public when we need it + def _input(self, inputs_in): + inputs = self._multiple_input(inputs_in) + if len(inputs) != 1: + raise "{0} layer only takes one input".format(self.layer_type) + return inputs[0] + + def _multiple_param_attr(self, length, param_attr_in=None): + param_attr = param_attr_in + if isinstance(param_attr, ParamAttr): + param_attr = [param_attr] + + if len(param_attr) != 1 and len(param_attr) != length: + raise ValueError("parameter number mismatch") + elif len(param_attr) == 1 and length != 1: + tmp = [None] * length + for i in six.moves.range(length): + tmp[i] = copy.deepcopy(param_attr[0]) + param_attr = tmp + return param_attr + + def iter_inputs_and_params(self, inputs_in, param_attr_in=None): + """Access all inputs and params one by one + + Args: + inputs_in: inputs to be iter + param_attr_in: param_attr to be iter + + Returns input, param_attr + """ + inputs = inputs_in if (inputs_in is not None) else [] + inputs = 
self._multiple_input(inputs) + param_attrs = self._multiple_param_attr(len(inputs), param_attr_in) + for ipt, param_attr in zip(inputs, param_attrs): + yield ipt, param_attr + + def input_dtype(self, inputs_in): + """Get input data type + + Args: + inputs_in: inputs wanted know the data type + + Returns dtype of the input + """ + inputs = self._multiple_input(inputs_in) + dtype = None + for each in inputs: + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError("Data Type mismatch: %d to %d" % + (dtype, each.dtype)) + return dtype + + def get_parameter(self, name): + """Get parameter specifically + + Args: + name: parameter's name + + Returns target parameter + """ + param = self.main_program.global_block().var(name) + if not isinstance(param, Parameter): + raise ValueError("no Parameter name %s found" % name) + return param + + def append_bias_op(self, + input_var, + dim_start=1, + dim_end=None, + bias_attr=None): + """Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var + + Args: + input_var: the input variable. The len(input_var.shape) is + larger or equal than 2. 
+ dim_start: + dim_end: the shape of the bias will be + bias_attr: the bias_attr of it + + Return the Variable of after append bias op + """ + size = list(input_var.shape[dim_start:dim_end]) + bias_attr = bias_attr + if not bias_attr: + return input_var + + b = self.create_parameter( + attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) + self.append_op( + type='elementwise_add', + inputs={'X': [input_var], + 'Y': [b]}, + outputs={'Out': [tmp]}, + attrs={'axis': dim_start}) + return tmp + + # TODO: this should not be called anymore after all activation func move to Layers + def append_activation(self, + input_var, + act=None, + use_cudnn=None, + use_mkl_dnn=None): + """Append activation + + Args: + input_var: the input variable. The len(input_var.shape) is + larger or equal than 2. + act: activation type + use_mkl_dnn: if use mkldnn + use_cudnn: if use cudnn + + Return the Variable of after append activation + """ + act = act + if act is None: + return input_var + if isinstance(act, six.string_types): + act = {'type': act} + else: + raise TypeError(str(act) + " should be unicode or str") + + if (use_cudnn is not None) and use_cudnn: + act['use_cudnn'] = use_cudnn + if (use_mkl_dnn is not None) and use_mkl_dnn: + act['use_mkldnn'] = use_mkl_dnn + act_type = act.pop('type') + + tmp = input_var + # NOTE(dzhwinter): some activation support inplace compution. 
+ # NOTE(minqiyang): currently, we don't support inplace in imperative mode + if not _in_imperative_mode() and core.IsInplace(act_type): + tmp = input_var + else: + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) + self.append_op( + type=act_type, + inputs={"X": [input_var]}, + outputs={"Out": [tmp]}, + attrs=act) + return tmp + + def is_instance(self, param, cls): + """Check if the input parameter is instance of input class + + Args: + param: parameter to be check + cls: class of the parameter + + Return result of the check (True or False) + """ + param = param + if not isinstance(param, cls): + raise TypeError("The input {0} parameter of method {1} must be {2}", + param, self.layer_type, cls.__name__) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 46640ce37a78f7409af7f82d3302a610ccd366b2..0c96d4dc5910f9500755dcd9837eeaff5ad4f831 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -19,8 +19,8 @@ import numpy as np import collections from .. import unique_name from paddle.fluid import core +from .layer_object_helper import LayerObjectHelper from paddle.fluid import framework -from paddle.fluid.imperative import base __all__ = ['Layer', 'PyLayer'] @@ -44,6 +44,8 @@ class Layer(core.Layer): self._parameters = collections.OrderedDict() self._sub_layers = collections.OrderedDict() + self._helper = LayerObjectHelper(self._full_name) + def full_name(self): """Full name for this layers. @@ -53,6 +55,51 @@ class Layer(core.Layer): """ return self._full_name + def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): + """Create parameters for this layers. 
+ + Args: + attr: [ParamAttr] should be the parameter attribute for this parameter + shape: shape of the paramter + dtype: data type of this parameter + is_bias: if this is a bias parameter + default_initializer: set the default initializer for this parameter + + Returns created parameter Variable. + """ + return self._helper.create_parameter(attr, shape, dtype, is_bias, + default_initializer) + + # TODO: Add more parameter list when we need them + def create_variable(self, + name=None, + persistable=None, + dtype=None, + type=core.VarDesc.VarType.LOD_TENSOR): + """Create Variable for this layers. + + Args: + name: name of the variable + persistable: if set this variable persistable + dtype: data type of data in the variable + type: type of the variable + + Returns created Variable. + """ + if name is not None: + var_name = ".".join([self._full_name, name]) + else: + var_name = unique_name.generate(".".join( + [self._full_name, "_generated_var"])) + + return self._helper.main_program.current_block().create_var( + name=var_name, persistable=persistable, dtype=dtype, type=type) + def parameters(self, include_sublayers=True): """Returns a list of Parameters from current and sub-layers. diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 41655c4f54eecec55bd2c7d2b74adb51efa88b61..4786f8b8ad3cdd3e16a5fb4ed15c32704f5c7990 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -41,21 +41,12 @@ class Conv2D(layers.Layer): bias_attr=None, dtype=core.VarDesc.VarType.FP32): assert param_attr is not False, "param_attr should not be False here." - super(Conv2D, self).__init__(name_scope, dtype=dtype) - - # TODO(minqiyang): Move this to the top. 
- from ..layer_helper import LayerHelper - self._helper = LayerHelper( - self.full_name(), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=dtype, - act=act) - + super(Conv2D, self).__init__(name_scope) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') self._padding = utils.convert_to_list(padding, 2, 'padding') self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + self._act = act if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") self._use_cudnn = use_cudnn @@ -80,28 +71,28 @@ class Conv2D(layers.Layer): std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) - self._filter_param = self._helper.create_parameter( - attr=self._helper.param_attr, + self._filter_param = self.create_parameter( + attr=param_attr, shape=filter_shape, dtype=self._dtype, default_initializer=_get_default_param_initializer()) if self._use_cudnn: - self._helper.create_variable( + self.create_variable( name="kCUDNNFwdAlgoCache", persistable=True, type=core.VarDesc.VarType.RAW) - self._helper.create_variable( + self.create_variable( name="kCUDNNBwdDataAlgoCache", persistable=True, type=core.VarDesc.VarType.RAW) - self._helper.create_variable( + self.create_variable( name="kCUDNNBwdFilterAlgoCache", persistable=True, type=core.VarDesc.VarType.RAW) - self._bias_param = self._helper.create_parameter( - attr=self._helper.bias_attr, + self._bias_param = self.create_parameter( + attr=bias_attr, shape=[num_filters], dtype=self._dtype, is_bias=True) @@ -137,7 +128,7 @@ class Conv2D(layers.Layer): attrs={'axis': 1}) # Currently, we don't support inplace in imperative mode - return self._helper.append_activation(pre_act) + return self._helper.append_activation(pre_act, act=self._act) class Pool2D(layers.Layer): @@ -167,9 +158,6 @@ class Pool2D(layers.Layer): super(Pool2D, self).__init__(name_scope, dtype=dtype) - from ..layer_helper import LayerHelper - self._helper = LayerHelper(self.full_name(), dtype=dtype) - 
self._pool_type = pool_type self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') self._pool_padding = utils.convert_to_list(pool_padding, 2, @@ -216,28 +204,25 @@ class FC(layers.Layer): self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype - from ..layer_helper import LayerHelper - self._helper = LayerHelper( - self.full_name(), - param_attr=param_attr, - bias_attr=bias_attr, - act=act) + self._param_attr = param_attr + self._bias_attr = param_attr + self._act = act def _build_once(self, input): input_shape = input.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) ] + [self._size] - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, + self._w = self.create_parameter( + attr=self._param_attr, shape=param_shape, dtype=self._dtype, is_bias=False) - if self._helper.bias_attr: + if self._param_attr: size = list([self._size]) - self._b = self._helper.create_parameter( - attr=self._helper.bias_attr, + self._b = self.create_parameter( + attr=self._param_attr, shape=size, dtype=self._dtype, is_bias=True) @@ -275,7 +260,7 @@ class FC(layers.Layer): else: pre_activation = pre_bias # Currently, we don't support inplace in imperative mode - return self._helper.append_activation(pre_activation) + return self._helper.append_activation(pre_activation, act=self._act) class BatchNorm(layers.Layer): @@ -297,16 +282,12 @@ class BatchNorm(layers.Layer): fuse_with_relu=False, use_global_stats=False): super(BatchNorm, self).__init__(name_scope) + self._param_attr = param_attr + self._param_attr = bias_attr + self._act = act assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
- from ..layer_helper import LayerHelper - self._helper = LayerHelper( - self.full_name(), - param_attr=param_attr, - bias_attr=bias_attr, - act=act) - if dtype == core.VarDesc.VarType.FP16: self._dtype = core.VarDesc.VarType.FP32 else: @@ -315,23 +296,23 @@ class BatchNorm(layers.Layer): param_shape = [num_channels] # create parameter - self._scale = self._helper.create_parameter( - attr=self._helper.param_attr, + self._scale = self.create_parameter( + attr=self._param_attr, shape=param_shape, dtype=self._dtype, default_initializer=Constant(1.0)) - if use_global_stats and self._helper.param_attr.learning_rate == 0.: + if use_global_stats and self._param_attr.learning_rate == 0.: self._scale._stop_gradient = True - self._bias = self._helper.create_parameter( - attr=self._helper.bias_attr, + self._bias = self.create_parameter( + attr=self._param_attr, shape=param_shape, dtype=self._dtype, is_bias=True) - if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + if use_global_stats and self._param_attr.learning_rate == 0.: self._bias._stop_gradient = True - self._mean = self._helper.create_parameter( + self._mean = self.create_parameter( attr=ParamAttr( name=moving_mean_name, initializer=Constant(0.0), @@ -341,7 +322,7 @@ class BatchNorm(layers.Layer): dtype=self._dtype) self._mean._stop_gradient = True - self._variance = self._helper.create_parameter( + self._variance = self.create_parameter( attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), @@ -401,7 +382,7 @@ class BatchNorm(layers.Layer): }) # Currently, we don't support inplace in imperative mode - return self._helper.append_activation(batch_norm_out) + return self._helper.append_activation(batch_norm_out, self._act) class Embedding(layers.Layer): @@ -466,9 +447,7 @@ class Embedding(layers.Layer): if self._remote_prefetch: assert self._is_sparse is True and self._is_distributed is False - from ..layer_helper import LayerHelper - self._helper = LayerHelper(self.full_name(), 
param_attr=param_attr) - self._w = self._helper.create_parameter( + self._w = self.create_parameter( attr=self._param_attr, shape=self._size, dtype=self._dtype, diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 190e7b5608a0cdf156b449e919e108a0917a0980..482dfa6fac05bd914efa384bd0f5ec54cfab1dca 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -19,7 +19,6 @@ import numpy as np from .wrapped_decorator import signature_safe_contextmanager from .core import VarDesc from . import unique_name -from .imperative import base as imperative_base __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -166,7 +165,7 @@ class ConstantInitializer(Initializer): 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -246,7 +245,7 @@ class UniformInitializer(Initializer): attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -325,7 +324,7 @@ class NormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -404,7 +403,7 @@ class TruncatedNormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -510,7 +509,7 @@ class XavierInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -611,7 +610,7 @@ class MSRAInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not imperative_base.enabled(): + if not 
framework._in_imperative_mode(): var.op = op return op @@ -710,7 +709,7 @@ class BilinearInitializer(Initializer): 'shape': list(shape), value_name: values }) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op @@ -769,7 +768,7 @@ class NumpyArrayInitializer(Initializer): value_name: values }, stop_gradient=True) - if not imperative_base.enabled(): + if not framework._in_imperative_mode(): var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 65864ca7e09cd4f0760637198d48154eed025c65..6f60fad94dca5b02bca14cda33df14c459d1a075 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -15,45 +15,29 @@ from __future__ import print_function import copy -import itertools import six -import sys -import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode +from .framework import Parameter, dtype_is_floating, _in_imperative_mode from . import unique_name -from paddle.fluid.imperative import base as imperative_base from paddle.fluid.initializer import Constant, Xavier -from .param_attr import ParamAttr, WeightNormParamAttr +from .param_attr import ParamAttr from . import core from six.moves import zip +from .layer_helper_base import LayerHelperBase -class LayerHelper(object): +class LayerHelper(LayerHelperBase): def __init__(self, layer_type, **kwargs): self.kwargs = kwargs - self.layer_type = layer_type name = self.kwargs.get('name', None) # TODO(panyx0718, minqiyang): imperative mode # can not use both `layer_type` and `name`. Deprecate LayerHelper # and write a Helper for imperative mode. 
if name is None: - self.kwargs['name'] = unique_name.generate(self.layer_type) + self.kwargs['name'] = unique_name.generate(layer_type) - @property - def name(self): - return self.kwargs['name'] - - @property - def main_program(self): - return default_main_program() - - @property - def startup_program(self): - return default_startup_program() - - def to_variable(self, x): - return imperative_base.to_variable(x, self.main_program.current_block()) + super(LayerHelper, self).__init__( + self.kwargs['name'], layer_type=layer_type) def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) @@ -82,6 +66,7 @@ class LayerHelper(object): def bias_attr(self): return ParamAttr._to_attr(self.kwargs.get('bias_attr', None)) + #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of param_attr def multiple_param_attr(self, length): param_attr = self.param_attr if isinstance(param_attr, ParamAttr): @@ -113,297 +98,13 @@ class LayerHelper(object): (dtype, each.dtype)) return dtype - def _create_weight_normalize(self, attr, shape, dtype): - from .layers import elementwise_mul, elementwise_div, reshape - - # Remove these ops when LayerHelper and layers support indicating - # program and block. 
- def __norm_op(x, - out=None, - p=2, - dim=None, - keep_dim=False, - block=self.startup_program.global_block()): - if out is None: - out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_norm'])), - dtype=dtype, - persistable=False) - abs_out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_abs'])), - dtype=dtype, - persistable=False) - block.append_op( - type='abs', inputs={'X': x}, outputs={'Out': abs_out}) - pow_out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_pow'])), - dtype=dtype, - persistable=False) - block.append_op( - type='pow', - inputs={'X': abs_out}, - outputs={'Out': pow_out}, - attrs={'factor': float(p)}) - sum_out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_sum'])), - dtype=dtype, - persistable=False) - block.append_op( - type='reduce_sum', - inputs={'X': pow_out}, - outputs={'Out': sum_out}, - attrs={ - 'dim': dim, - 'keep_dim': keep_dim, - 'reduce_all': True if dim is None else False - }) - block.append_op( - type='pow', - inputs={'X': sum_out}, - outputs={'Out': out}, - attrs={'factor': 1. 
/ p}) - return out - - def __reshape_op(x, - shape, - out=None, - block=self.startup_program.global_block()): - if out is None: - out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_reshape'])), - dtype=dtype, - persistable=False) - block.append_op( - type='reshape', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'shape': shape}) - return out - - def __transpose_op(x, - axis, - out=None, - block=self.startup_program.global_block()): - if out is None: - out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_transpose'])), - dtype=dtype, - persistable=False) - block.append_op( - type='transpose', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'axis': axis}) - return out - - def __norm_except_dim(x, - out=None, - dim=None, - block=self.startup_program.global_block()): - """Computes the norm over all dimensions except dim""" - if out is None: - out = block.create_var( - name=unique_name.generate(".".join( - [self.name, 'weight_norm_norm'])), - dtype=dtype, - persistable=False) - if dim is None: - __norm_op(x, out, dim=dim, block=block) - elif dim == 0: - out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1) - reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block) - norm = __norm_op(reshape, dim=1, block=block) - __reshape_op(norm, out=out, shape=out_shape, block=block) - elif dim == len(x.shape) - 1: - out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]] - reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block) - norm = __norm_op(reshape, dim=0, block=block) - __reshape_op(norm, out=out, shape=out_shape, block=block) - else: - perm = list(range(len(x.shape))) - perm[0], perm[dim] = dim, 0 - transpose = __transpose_op(x, perm, block=block) - norm = __norm_op(transpose, dim=0, block=block) - __transpose_op(norm, perm, out=out, block=block) - return out - - def __weight_normalize(g, v, dim): - """Calculations for weight normalization""" - norm = __norm_except_dim( - v, dim=dim, 
block=self.main_program.current_block()) - scale = elementwise_div( - x=g, y=norm) # The shapes of g and norm are the same. - # Currently, elementwise_mul only support broadcast when the shape - # of y is a subset of the shape of x. Thus, we reshape y to squeeze - # to achive the subset. - w = elementwise_mul( - x=v, - y=scale if dim is None else reshape( - x=scale, shape=[v.shape[dim]]), - axis=-1 if dim is None else dim) - # To serialize the original parameter for inference, maybe a - # parameter rather than a variable should be returned. - return w - - g_param_attr = copy.deepcopy(attr) - g_param_attr.name = attr.name + '_g' - g_param_shape = [1] * len(shape) - if attr.dim is not None: - g_param_shape[attr.dim] = shape[attr.dim] - v_param_attr = copy.deepcopy(attr) - v_param_attr.name = attr.name + '_v' - v_param_shape = shape - - # Add to startup_program to initialize g and v. - # Try to reconstruct the initializer of w by initializing g and v. - # Set the initializers of g and v as below, then the distribution - # of w is the same as initializing w with the given initializer. - # For Data-Dependent Initialization, please compute the init-values - # of g and v in external and then feed the values to g and v by - # executing an extra program. 
- g_param = self.startup_program.global_block().create_parameter( - dtype=dtype, - shape=g_param_shape, - **g_param_attr._to_kwargs(with_initializer=False)) - v_param = self.startup_program.global_block().create_parameter( - dtype=dtype, - shape=v_param_shape, - **v_param_attr._to_kwargs(with_initializer=True)) - __norm_except_dim( - x=v_param, - out=g_param, - dim=attr.dim, - block=self.startup_program.global_block()) - - # Add weight normalization to main_program - g_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()) - v_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()) - w_param = __weight_normalize(g_param, v_param, dim=attr.dim) - return w_param - - def create_parameter(self, - attr, - shape, - dtype, - is_bias=False, - default_initializer=None): - # Deepcopy the attr so that parameters can be shared in program - attr = copy.deepcopy(attr) - assert isinstance(attr, ParamAttr) - suffix = 'b' if is_bias else 'w' - if attr.name is None: - attr.name = unique_name.generate(".".join([self.name, suffix])) - - if default_initializer is None and attr.initializer is None: - if isinstance(dtype, core.VarDesc.VarType): - if dtype != core.VarDesc.VarType.FP32 and \ - dtype != core.VarDesc.VarType.FP64 and \ - dtype != core.VarDesc.VarType.FP16: - raise TypeError( - "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" - ) - else: - if not (dtype.startswith("float") or dtype == "double"): - raise TypeError( - "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" 
- ) - if is_bias: - attr._set_default_bias_initializer() - else: - attr._set_default_param_initializer() - else: - attr._set_default_initializer(default_initializer) - - # If weight normalization is set, insert extra parameters and ops. - # Refer to https://arxiv.org/pdf/1602.07868.pdf - if isinstance(attr, WeightNormParamAttr): - param = self._create_weight_normalize(attr, shape, dtype) - WeightNormParamAttr.params_with_weight_norm.append(param) - return param - if _in_imperative_mode(): - # In imperative mode, we want the returned parameter to be - # initialized so that it can be used imperatively. - return self.main_program.global_block().create_parameter( - dtype=dtype, - shape=shape, - **attr._to_kwargs(with_initializer=True)) - else: - self.startup_program.global_block().create_parameter( - dtype=dtype, - shape=shape, - **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) - def get_parameter(self, name): param = self.main_program.global_block().var(name) if not isinstance(param, Parameter): raise ValueError("no Parameter name %s found" % name) return param - def create_variable_for_type_inference(self, dtype, stop_gradient=False): - """Create a temporary variable that should be type inferred layer. - - Note: - The default type will be set to LOD_TENSOR. However, when - the var is used as operator output, its type will be updated - based on operator's `VarTypeInference` implementation in - infer_var_type. 
- """ - return self.main_program.current_block().create_var( - name=unique_name.generate(".".join([self.name, 'tmp'])), - dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=stop_gradient) - - def create_variable(self, *args, **kwargs): - return self.main_program.current_block().create_var(*args, **kwargs) - - def create_global_variable(self, persistable=False, *args, **kwargs): - """ - create global variable, note that there is no initializer for this global variable. - Args: - persistable(bool): True if it is a checkpoint value. - *args: See create_var's documentation - **kwargs: See create_var's documentation - - Returns(Variable): the created variable. - """ - return self.main_program.global_block().create_var( - *args, persistable=persistable, **kwargs) - - def create_or_get_global_variable(self, name, *args, **kwargs): - """ - Creates a global variable if not exists and returns the variable and - a boolean flag which is true when it is a new variable. - """ - if self.main_program.global_block().has_var(name): - return self.main_program.global_block().var(name), False - else: - return self.create_global_variable(name=name, *args, **kwargs), True - - def set_variable_initializer(self, var, initializer): - assert isinstance(var, Variable) - if imperative_base.enabled(): - initializer(var, var.block) - else: - self.startup_program.global_block().create_var( - name=var.name, - type=var.type, - dtype=var.dtype, - shape=var.shape, - persistable=True, - initializer=initializer) - + #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of bias_attr def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. 
If the user does not set @@ -434,6 +135,7 @@ class LayerHelper(object): attrs={'axis': dim_start}) return tmp + #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act def append_activation(self, input_var): act = self.kwargs.get('act', None) if act is None: @@ -448,10 +150,11 @@ class LayerHelper(object): if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act_type = act.pop('type') + tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. # NOTE(minqiyang): currently, we don't support inplace in imperative mode - if not imperative_base.enabled() and core.IsInplace(act_type): + if not _in_imperative_mode() and core.IsInplace(act_type): tmp = input_var else: tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) @@ -462,6 +165,7 @@ class LayerHelper(object): attrs=act) return tmp + #TODO (jiabin): should we remove this since it has never be used def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: return Xavier() @@ -469,6 +173,7 @@ class LayerHelper(object): # For integer and boolean types, initialize with all zeros return Constant() + #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs def is_instance(self, param_name, cls): param = self.kwargs.get(param_name, None) if not isinstance(param, cls): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b38137e4e014d0244fe206bd964a304a291345 --- /dev/null +++ b/python/paddle/fluid/layer_helper_base.py @@ -0,0 +1,381 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import copy +import numpy as np + +from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place +from . import unique_name +from .param_attr import ParamAttr, WeightNormParamAttr +from . import core + + +class LayerHelperBase(object): + def __init__(self, name, layer_type): + self._layer_type = layer_type + self._name = name + + @property + def name(self): + return self._name + + @property + def layer_type(self): + return self._layer_type + + @property + def main_program(self): + return default_main_program() + + @property + def startup_program(self): + return default_startup_program() + + def to_variable(self, value, block=None): + """convert value to variable + + Args: + value: value to be convert + block: the block of the variable + + Return Variable construct from value + """ + if isinstance(value, np.ndarray): + assert _in_imperative_mode( + ), "to_variable could only be called in imperative mode" + + if not block: + block = default_main_program().current_block() + py_var = Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + var = py_var._ivar.value() + tensor = var.get_tensor() + tensor.set(value, _current_expected_place()) + return py_var + elif isinstance(value, Variable): + return value + + def _create_weight_normalize(self, attr, shape, dtype): + from .layers import elementwise_mul, elementwise_div, reshape + + # Remove these ops when LayerHelper and layers support indicating + 
# program and block. + def __norm_op(x, + out=None, + p=2, + dim=None, + keep_dim=False, + block=self.startup_program.global_block()): + if out is None: + out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_norm'])), + dtype=dtype, + persistable=False) + abs_out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_abs'])), + dtype=dtype, + persistable=False) + block.append_op( + type='abs', inputs={'X': x}, outputs={'Out': abs_out}) + pow_out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_pow'])), + dtype=dtype, + persistable=False) + block.append_op( + type='pow', + inputs={'X': abs_out}, + outputs={'Out': pow_out}, + attrs={'factor': float(p)}) + sum_out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_sum'])), + dtype=dtype, + persistable=False) + block.append_op( + type='reduce_sum', + inputs={'X': pow_out}, + outputs={'Out': sum_out}, + attrs={ + 'dim': dim, + 'keep_dim': keep_dim, + 'reduce_all': True if dim is None else False + }) + block.append_op( + type='pow', + inputs={'X': sum_out}, + outputs={'Out': out}, + attrs={'factor': 1. 
/ p}) + return out + + def __reshape_op(x, + shape, + out=None, + block=self.startup_program.global_block()): + if out is None: + out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_reshape'])), + dtype=dtype, + persistable=False) + block.append_op( + type='reshape', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'shape': shape}) + return out + + def __transpose_op(x, + axis, + out=None, + block=self.startup_program.global_block()): + if out is None: + out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_transpose'])), + dtype=dtype, + persistable=False) + block.append_op( + type='transpose', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'axis': axis}) + return out + + def __norm_except_dim(x, + out=None, + dim=None, + block=self.startup_program.global_block()): + """Computes the norm over all dimensions except dim""" + if out is None: + out = block.create_var( + name=unique_name.generate(".".join( + [self.name, 'weight_norm_norm'])), + dtype=dtype, + persistable=False) + if dim is None: + __norm_op(x, out, dim=dim, block=block) + elif dim == 0: + out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1) + reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block) + norm = __norm_op(reshape, dim=1, block=block) + __reshape_op(norm, out=out, shape=out_shape, block=block) + elif dim == len(x.shape) - 1: + out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]] + reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block) + norm = __norm_op(reshape, dim=0, block=block) + __reshape_op(norm, out=out, shape=out_shape, block=block) + else: + perm = list(range(len(x.shape))) + perm[0], perm[dim] = dim, 0 + transpose = __transpose_op(x, perm, block=block) + norm = __norm_op(transpose, dim=0, block=block) + __transpose_op(norm, perm, out=out, block=block) + return out + + def __weight_normalize(g, v, dim): + """Calculations for weight normalization""" + norm = __norm_except_dim( + v, dim=dim, 
block=self.main_program.current_block()) + scale = elementwise_div( + x=g, y=norm) # The shapes of g and norm are the same. + # Currently, elementwise_mul only support broadcast when the shape + # of y is a subset of the shape of x. Thus, we reshape y to squeeze + # to achive the subset. + w = elementwise_mul( + x=v, + y=scale if dim is None else reshape( + x=scale, shape=[v.shape[dim]]), + axis=-1 if dim is None else dim) + # To serialize the original parameter for inference, maybe a + # parameter rather than a variable should be returned. + return w + + g_param_attr = copy.deepcopy(attr) + g_param_attr.name = attr.name + '_g' + g_param_shape = [1] * len(shape) + if attr.dim is not None: + g_param_shape[attr.dim] = shape[attr.dim] + v_param_attr = copy.deepcopy(attr) + v_param_attr.name = attr.name + '_v' + v_param_shape = shape + + # Add to startup_program to initialize g and v. + # Try to reconstruct the initializer of w by initializing g and v. + # Set the initializers of g and v as below, then the distribution + # of w is the same as initializing w with the given initializer. + # For Data-Dependent Initialization, please compute the init-values + # of g and v in external and then feed the values to g and v by + # executing an extra program. 
+ g_param = self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=g_param_shape, + **g_param_attr._to_kwargs(with_initializer=False)) + v_param = self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=v_param_shape, + **v_param_attr._to_kwargs(with_initializer=True)) + __norm_except_dim( + x=v_param, + out=g_param, + dim=attr.dim, + block=self.startup_program.global_block()) + + # Add weight normalization to main_program + g_param = self.main_program.global_block().create_parameter( + dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()) + v_param = self.main_program.global_block().create_parameter( + dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()) + w_param = __weight_normalize(g_param, v_param, dim=attr.dim) + return w_param + + # TODO: hide the func after we move the layers to Layers + def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): + """Create parameters for this layers. + + Args: + attr: [ParamAttr] should be the parameter attribute for this parameter + shape: shape of the paramter + dtype: data type of this parameter + is_bias: if this is a bias parameter + default_initializer: set the default initializer for this parameter + + Returns created parameter Variable. + """ + # Deepcopy the attr so that parameters can be shared in program + attr = copy.deepcopy(attr) + if attr is None: + attr = ParamAttr._to_attr(attr) + assert isinstance(attr, ParamAttr) + suffix = 'b' if is_bias else 'w' + if attr.name is None: + attr.name = unique_name.generate(".".join([self.name, suffix])) + + if default_initializer is None and attr.initializer is None: + if isinstance(dtype, core.VarDesc.VarType): + if dtype != core.VarDesc.VarType.FP32 and \ + dtype != core.VarDesc.VarType.FP64 and \ + dtype != core.VarDesc.VarType.FP16: + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. 
Set default_initializer to fit the parameter dtype!" + ) + else: + if not (dtype.startswith("float") or dtype == "double"): + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" + ) + if is_bias: + attr._set_default_bias_initializer() + else: + attr._set_default_param_initializer() + else: + attr._set_default_initializer(default_initializer) + + # If weight normalization is set, insert extra parameters and ops. + # Refer to https://arxiv.org/pdf/1602.07868.pdf + if isinstance(attr, WeightNormParamAttr): + param = self._create_weight_normalize(attr, shape, dtype) + WeightNormParamAttr.params_with_weight_norm.append(param) + return param + if _in_imperative_mode(): + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. + return self.main_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) + + def create_variable_for_type_inference(self, dtype, stop_gradient=False): + """Create a temporary variable that should be type inferred layer. + + Note: + The default type will be set to LOD_TENSOR. However, when + the var is used as operator output, its type will be updated + based on operator's `VarTypeInference` implementation in + infer_var_type. + """ + return self.main_program.current_block().create_var( + name=unique_name.generate(".".join([self.name, 'tmp'])), + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=stop_gradient) + + def create_variable(self, *args, **kwargs): + """Create Variable for this layers. + Returns created Variable. 
+ """ + return self.main_program.current_block().create_var(*args, **kwargs) + + def create_global_variable(self, persistable=False, *args, **kwargs): + """ + create global variable, note that there is no initializer for this global variable. + Args: + persistable(bool): True if it is a checkpoint value. + *args: See create_var's documentation + **kwargs: See create_var's documentation + + Returns(Variable): the created variable. + """ + return self.main_program.global_block().create_var( + *args, persistable=persistable, **kwargs) + + def create_or_get_global_variable(self, name, *args, **kwargs): + """ + Creates a global variable if not exists and returns the variable and + a boolean flag which is true when it is a new variable. + """ + if self.main_program.global_block().has_var(name): + return self.main_program.global_block().var(name), False + else: + return self.create_global_variable(name=name, *args, **kwargs), True + + def set_variable_initializer(self, var, initializer): + """Set target Variable's initializer + + Args: + var: target Variable + initializer: initializer to use + """ + assert isinstance(var, Variable) + if _in_imperative_mode(): + initializer(var, var.block) + else: + self.startup_program.global_block().create_var( + name=var.name, + type=var.type, + dtype=var.dtype, + shape=var.shape, + persistable=True, + initializer=initializer) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 539c9675b2d69b599fc63350c0c7c3b14e32995a..e7f704515df947f107df6d83a644530a0e468430 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -848,7 +848,7 @@ def create_array(dtype): @templatedoc() -def less_than(x, y, force_cpu=None, cond=None, **ignored): +def less_than(x, y, force_cpu=None, cond=None): """ ${comment} @@ -1800,7 +1800,7 @@ def reorder_lod_tensor_by_rank(x, rank_table): return out -def is_empty(x, cond=None, **ignored): +def is_empty(x, 
cond=None): """ Test whether a Variable is empty. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 61a7d4f31d5245e635e2e1fe33e418ce20e94180..cbedd70f857b3f767492826cda08ae1171d72bad 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -51,6 +51,8 @@ __all__ = [ 'yolov3_loss', 'box_clip', 'multiclass_nms', + 'distribute_fpn_proposals', + 'box_decoder_and_assign', ] @@ -2221,3 +2223,138 @@ def multiclass_nms(bboxes, output.stop_gradient = True return output + + +def distribute_fpn_proposals(fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + name=None): + """ + In Feature Pyramid Networks (FPN) models, it is needed to distribute all + proposals into different FPN level, with respect to scale of the proposals, + the referring scale and the referring level. Besides, to restore the order + of proposals, we return an array which indicates the original index of rois + in current proposals. To compute FPN level for each roi, the formula is + given as follows: + + .. math:: + + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} + + level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + + where BBoxArea is a function to compute the area of each roi. + + Args: + fpn_rois(variable): The input fpn_rois, the second dimension is 4. + min_level(int): The lowest level of FPN layer where the proposals come + from. + max_level(int): The highest level of FPN layer where the proposals + come from. + refer_level(int): The referring level of FPN layer with specified scale. + refer_scale(int): The referring scale of FPN layer with specified level. + name(str|None): The name of this operator. + + Returns: + tuple: + A tuple(multi_rois, restore_ind) is returned. The multi_rois is + a list of segmented tensor variables. The restore_ind is a 2D + Tensor with shape [N, 1], N is the number of total rois. It is + used to restore the order of fpn_rois. + + Examples: + .. 
code-block:: python + + fpn_rois = fluid.layers.data( + name='data', shape=[4], dtype='float32', lod_level=1) + multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + """ + + helper = LayerHelper('distribute_fpn_proposals', **locals()) + dtype = helper.input_dtype() + num_lvl = max_level - min_level + 1 + multi_rois = [ + helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) + ] + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type='distribute_fpn_proposals', + inputs={'FpnRois': fpn_rois}, + outputs={'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind}, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale + }) + return multi_rois, restore_ind + + +@templatedoc() +def box_decoder_and_assign(prior_box, + prior_box_var, + target_box, + box_score, + box_clip, + name=None): + """ + ${comment} + Args: + prior_box(${prior_box_type}): ${prior_box_comment} + prior_box_var(${prior_box_var_type}): ${prior_box_var_comment} + target_box(${target_box_type}): ${target_box_comment} + box_score(${box_score_type}): ${box_score_comment} + box_clip(${box_clip_type}): ${box_clip_comment} + name(str|None): The name of this operator + Returns: + decode_box(Variable), output_assign_box(Variable): + + two variables: + + - decode_box(${decode_box_type}): ${decode_box_comment} + - output_assign_box(${output_assign_box_type}): ${output_assign_box_comment} + + Examples: + .. 
code-block:: python + + pb = fluid.layers.data( + name='prior_box', shape=[20, 4], dtype='float32') + pbv = fluid.layers.data( + name='prior_box_var', shape=[1, 4], dtype='float32') + loc = fluid.layers.data( + name='target_box', shape=[20, 4*81], dtype='float32') + scores = fluid.layers.data( + name='scores', shape=[20, 81], dtype='float32') + decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign( + pb, pbv, loc, scores, 4.135) + + """ + helper = LayerHelper("box_decoder_and_assign", **locals()) + + decoded_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype) + output_assign_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype) + + helper.append_op( + type="box_decoder_and_assign", + inputs={ + "PriorBox": prior_box, + "PriorBoxVar": prior_box_var, + "TargetBox": target_box, + "BoxScore": box_score + }, + attrs={"box_clip": box_clip}, + outputs={ + "DecodeBox": decoded_box, + "OutputAssignBox": output_assign_box + }) + return decoded_box, output_assign_box diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0f4fe1b559e1e79bace82e13f0f8828b869d69b7..5b4f1efe479b12cb8ec390b8753d097764d70860 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4833,11 +4833,6 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ def __check_input(x, y): - if len(y.shape) > len(x.shape): - raise ValueError( - "Invalid inputs for matmul. 
" - "x's rank should be always greater than or equal to y'rank.") - x_shape = list(x.shape) y_shape = list(y.shape) if len(x_shape) == 1: @@ -4853,10 +4848,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): if x_shape[-1] != y_shape[-2]: raise ValueError("Invalid inputs for matmul.") - if len(y_shape) > 2: + if len(y_shape) > 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): if dim_x != y_shape[i]: - raise ValueError("Invalid inputs for matmul.") + raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" % + (x.shape, y.shape)) __check_input(x, y) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index af747c3cecac66492bb2e2642a88f66a5cfae3db..cb973986988c2909f5ef1e15dd32db3e83b1d269 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -142,7 +142,8 @@ def create_global_var(shape, def cast(x, dtype): """ This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts - it to the output with :attr:`dtype`. + it to the output with :attr:`dtype`. It's meaningless if the output + dtype equals the input dtype, but it's fine if you do so. Args: x (Variable): The input Variable for casting. 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index cb799b639648fc0af64a890ffe788d23e7f4f9eb..86b7716664c54fb389c671d0c0d2d69d2a0e4a2d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -379,7 +379,7 @@ class Optimizer(object): self._dtype = loss.dtype program = loss.block.program optimize_ops = [] - if imperative_base.enabled(): + if framework._in_imperative_mode(): if parameter_list is not None: parameters = parameter_list else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 2ebaab3b1024878e28ae7064bfc5c3d1d091ad94..517418da1cf2f745ee5578e3c2b118394db7fae7 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -106,13 +106,18 @@ class ParallelExecutor(object): else framework.default_main_program() self._compiled_program = compiler.CompiledProgram(main_program) + if share_vars_from: + assert isinstance( + share_vars_from, ParallelExecutor + ), "The share_vars_from should be ParallelExecutor." 
self._compiled_program.with_data_parallel( loss_name=loss_name, build_strategy=build_strategy, exec_strategy=exec_strategy, - share_vars_from=share_vars_from) + share_vars_from=share_vars_from._compiled_program + if share_vars_from else None) self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() - self._executor = executor.Executor(self._place) + self._exe = executor.Executor(self._place) self._compiled_program._compile(place=self._place, scope=self._scope) def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): @@ -180,11 +185,11 @@ class ParallelExecutor(object): loss = pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])) """ - return self._executor.run(program=self._compiled_program, - scope=self._scope, - feed=feed, - fetch_list=fetch_list, - return_numpy=return_numpy) + return self._exe.run(program=self._compiled_program, + scope=self._scope, + feed=feed, + fetch_list=fetch_list, + return_numpy=return_numpy) @property def device_count(self): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 0d39a139eed87f900b1f59fd0569b6acaec0962b..6218db73459a2bb55d72545c738f88dbd8cce0f7 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase): self.assertIsNotNone(output) +class TestDistributeFpnProposals(unittest.TestCase): + def test_distribute_fpn_proposals(self): + program = Program() + with program_guard(program): + fpn_rois = fluid.layers.data( + name='data', shape=[4], dtype='float32', lod_level=1) + multi_rois, restore_ind = layers.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + self.assertIsNotNone(multi_rois) + self.assertIsNotNone(restore_ind) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py 
b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py index 871f8403f812c87ac493b82482fe01fdf61037d4..57a5714fc7853905703e9db31bc143fb5cabfacb 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py @@ -70,3 +70,17 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out, fetch_list=['x@GRAD', 'out']) __assert_close(x_grad, out[0], 'x@GRAD') + + +def format_reorder(out, size): + in_n = size[0] + out_h = size[2] + out_w = size[3] + out_c = size[1] + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 100a03cea0f740a615c4a08810d4ad9e8c974d7a..c7b8a096bf1a7e2f5b63b136c7036edad863c888 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp +from mkldnn_op_test import format_reorder def conv2d_forward_refer(input, filter, group, conv_param): @@ -29,20 +30,6 @@ def conv2d_forward_refer(input, filter, group, conv_param): return format_reorder(out, size) -def format_reorder(out, size): - in_n = size[0] - out_h = size[2] - out_w = size[3] - out_c = size[1] - out_tmp = np.zeros((in_n, out_h, out_w, out_c)) - for n in range(in_n): - for i in range(out_h): - for j in range(out_w): - for m in range(out_c): - out_tmp[n, i, j, m] = out[n, m, i, j] - return out_tmp.reshape(in_n, out_c, out_h, out_w) - - 
class TestConv2dInt8Op(TestConv2dOp): def setUp(self): self.op_type = "conv2d" diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a4683558539d3f9daa6a1146355acc3ff2bab7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from mkldnn_op_test import format_reorder + + +class TestReQuantizeOp(OpTest): + def setUp(self): + self.op_type = 'requantize' + self.scale_in = 2.0 + self.scale_out = 1.5 + self.input_size = [1, 1, 5, 5] + self.data_type = 'int8' + self.set_scale() + self.set_data_type() + + scale_shift = self.scale_out / self.scale_in + + if self.data_type == 'int8': + input = (np.random.randint(0, 100, self.input_size) - 50 + ).astype(self.data_type) + output_tmp = np.round(input.astype('float32') * + scale_shift).astype('int8') + else: + input = (np.random.randint(0, 100, + self.input_size)).astype(self.data_type) + output_tmp = np.round(input.astype('float32') * + scale_shift).astype('uint8') + + output = format_reorder(output_tmp, self.input_size) + + self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)} + + self.outputs = {'Output': output} + + self.attrs = {'Scale_in': self.scale_in, 'Scale_out': self.scale_out} + + def test_check_output(self): + self.check_output() + + def set_scale(self): + pass + + def set_data_type(OpTest): + pass + + +#--------------------test requantize with s8 input-------------------- + + +class TestReQuantizeOp1(TestReQuantizeOp): + def set_scale(self): + self.scale_in = 1.5 + self.scale_out = 1.5 + + +class TestReQuantizeOp2(TestReQuantizeOp): + def set_scale(self): + self.scale_in = 0.1 + self.scale_out = 0.2 + + +#--------------------test requantize with u8 input-------------------- + + +class TestReQuantizeOp3(TestReQuantizeOp1): + def set_data_type(self): + self.data_type = 'uint8' + + +class TestReQuantizeOp4(TestReQuantizeOp2): + def set_data_type(self): + self.data_type = 'uint8' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 
caf9750e58889ac40c7cdde022f0b6aa5e77fc42..b12aaea3219cb81e8fa0e7584120db510fb7b62c 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -16,27 +16,17 @@ import unittest import numpy as np import paddle.fluid as fluid -from paddle.fluid.layer_helper import LayerHelper class L1(fluid.imperative.Layer): def __init__(self, prefix): super(L1, self).__init__(prefix) - self._helper = LayerHelper( - self.full_name(), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - - self.w1 = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=[2, 2], - dtype='float32', - is_bias=False) - self.w2 = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=[2, 2], - dtype='float32', - is_bias=False) + self._param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)) + self.w1 = self.create_parameter( + attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False) + self.w2 = self.create_parameter( + attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False) def forward(self): return self.w1 + self.w2 @@ -67,8 +57,8 @@ class TestBaseLayer(unittest.TestCase): with fluid.imperative.guard(): l = L1('test_one_level') ret = l() - self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0") - self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1") + self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") + self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1") self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): @@ -76,12 +66,12 @@ class TestBaseLayer(unittest.TestCase): l = L3('test_three_level') names = [p.name for p in l.parameters()] ret = l() - self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0") - self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1") - self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0") - 
self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1") - self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0") - self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1") + self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0.w_0") + self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0.w_1") + self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1.w_0") + self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1") + self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0") + self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1") self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2]))) diff --git a/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py b/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b0afc2a2e4ad7b72b341536babfc595c2b6c3455 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py @@ -0,0 +1,96 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip): + boxes = boxes.astype(deltas.dtype, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] * wx + dy = deltas[:, 1::4] * wy + dw = deltas[:, 2::4] * ww + dh = deltas[:, 3::4] * wh + # Prevent sending too large values into np.exp() + dw = np.minimum(dw, box_clip) + dh = np.minimum(dh, box_clip) + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + output_assign_box = [] + for ino in range(len(pred_boxes)): + rank = np.argsort(-box_score[ino]) + maxidx = rank[0] + if maxidx == 0: + maxidx = rank[1] + beg_pos = maxidx * 4 + end_pos = maxidx * 4 + 4 + output_assign_box.append(pred_boxes[ino, beg_pos:end_pos]) + output_assign_box = np.array(output_assign_box) + + return pred_boxes, output_assign_box + + +class TestBoxDecoderAndAssignOpWithLoD(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_decoder_and_assign" + lod = [[4, 8, 8]] + num_classes = 10 + prior_box = np.random.random((20, 4)).astype('float32') + prior_box_var = np.array([0.1, 0.1, 0.2, 0.2], 
dtype=np.float32) + target_box = np.random.random((20, 4 * num_classes)).astype('float32') + box_score = np.random.random((20, num_classes)).astype('float32') + box_clip = 4.135 + output_box, output_assign_box = box_decoder_and_assign( + target_box, prior_box_var, prior_box, box_score, box_clip) + + self.inputs = { + 'PriorBox': (prior_box, lod), + 'PriorBoxVar': prior_box_var, + 'TargetBox': (target_box, lod), + 'BoxScore': (box_score, lod), + } + self.attrs = {'box_clip': box_clip} + self.outputs = { + 'DecodeBox': output_box, + 'OutputAssignBox': output_assign_box + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py new file mode 100644 index 0000000000000000000000000000000000000000..d063f8473e0f50256dc424429ce1244a4b893ccf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "dist_mnist.py", + delta=1, + need_envs={ + "FLAGS_enable_parallel_graph": "1", + "FLAGS_sync_nccl_allreduce": "1" + }) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1464060f5961aff7fe513ae9edb2cd974bffb316 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +import sys +from op_test import OpTest + + +class TestDistributeFPNProposalsOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute() + self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'max_level': self.roi_max_level, + 'min_level': self.roi_min_level, + 'refer_scale': self.canonical_scale, + 'refer_level': self.canonical_level + } + output = [('out%d' % i, self.rois_fpn[i]) + for i in range(len(self.rois_fpn))] + self.outputs = { + 'MultiFpnRois': output, + 'RestoreIndex': self.rois_idx_restore + } + + def init_test_case(self): + self.roi_max_level = 5 + self.roi_min_level = 2 + self.canonical_scale = 224 + self.canonical_level = 4 + self.images_shape = [512, 512] + + def boxes_area(self, boxes): + w = (boxes[:, 2] - boxes[:, 0] + 1) + h = (boxes[:, 3] - boxes[:, 1] + 1) + areas = w * h + assert np.all(areas >= 0), 'Negative areas founds' + return areas + + def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max): + s = np.sqrt(self.boxes_area(rois)) + s0 = self.canonical_scale + lvl0 = self.canonical_level + target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6)) + target_lvls = np.clip(target_lvls, lvl_min, lvl_max) + return target_lvls + + def get_sub_lod(self, sub_lvl): + sub_lod = [] + max_batch_id = sub_lvl[-1] + for i in range(max_batch_id.astype(np.int32) + 1): + sub_lod.append(np.where(sub_lvl == i)[0].size) + return sub_lod + + def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max): + rois_idx_order = np.empty((0, )) + rois_fpn = [] + for lvl in range(lvl_min, lvl_max + 1): + idx_lvl = np.where(target_lvls == lvl)[0] + if len(idx_lvl) == 0: + rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]])) + continue + sub_lod = self.get_sub_lod(rois[idx_lvl, 0]) + rois_fpn.append((rois[idx_lvl, 1:], [sub_lod])) + rois_idx_order = 
np.concatenate((rois_idx_order, idx_lvl)) + rois_idx_restore = np.argsort(rois_idx_order).astype( + np.int32, copy=False) + return rois_fpn, rois_idx_restore + + def calc_rois_distribute(self): + lvl_min = self.roi_min_level + lvl_max = self.roi_max_level + target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min, + lvl_max) + rois_fpn, rois_idx_restore = self.add_multilevel_roi( + self.rois, target_lvls, lvl_min, lvl_max) + return rois_fpn, rois_idx_restore + + def make_rois(self): + self.rois_lod = [[100, 200]] + rois = [] + lod = self.rois_lod[0] + bno = 0 + for roi_num in lod: + for i in range(roi_num): + xywh = np.random.rand(4) + xy1 = xywh[0:2] * 20 + wh = xywh[2:4] * (self.images_shape - xy1) + xy2 = xy1 + wh + roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]] + rois.append(roi) + bno += 1 + self.rois = np.array(rois).astype("float32") + + def setUp(self): + self.op_type = "distribute_fpn_proposals" + self.set_data() + + def test_check_output(self): + self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index dae0c466ee5ea919688b29100f77f17f5f3b8c6d..97fc1eab3d372b07834e8b4e6b504eb7d677b0c7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -53,11 +53,15 @@ class MLP(fluid.imperative.Layer): super(MLP, self).__init__(name_scope) self._fc1 = FC(self.full_name(), 3, - fluid.ParamAttr( + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1))) self._fc2 = FC(self.full_name(), 4, - fluid.ParamAttr( + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1))) def forward(self, inputs): @@ -74,41 +78,37 @@ class SimpleRNNCell(fluid.imperative.Layer): 
self.step_input_size = step_input_size self.hidden_size = hidden_size self.output_size = output_size - self._dype = core.VarDesc.VarType.FP32 - from paddle.fluid.layer_helper import LayerHelper - self._helper = LayerHelper( - 'SimpleRNNCell', act="tanh", param_attr=param_attr) + self._dtype = core.VarDesc.VarType.FP32 + self.param_attr = param_attr def _build_once(self, inputs, pre_hidden): i2h_param_shape = [self.step_input_size, self.hidden_size] h2h_param_shape = [self.hidden_size, self.hidden_size] h2o_param_shape = [self.output_size, self.hidden_size] - self._i2h_w = self._helper.create_parameter( - attr=self._helper.param_attr, + self._i2h_w = self.create_parameter( + attr=self.param_attr, shape=i2h_param_shape, dtype=self._dtype, is_bias=False) - self._h2h_w = self._helper.create_parameter( - attr=self._helper.param_attr, + self._h2h_w = self.create_parameter( + attr=self.param_attr, shape=h2h_param_shape, dtype=self._dtype, is_bias=False) - self._h2o_w = self._helper.create_parameter( - attr=self._helper.param_attr, + self._h2o_w = self.create_parameter( + attr=self.param_attr, shape=h2o_param_shape, dtype=self._dtype, is_bias=False) def forward(self, input, pre_hidden): - tmp_i2h = self._helper.create_variable_for_type_inference(self._dtype) - tmp_h2h = self._helper.create_variable_for_type_inference(self._dtype) - hidden = self._helper.create_variable_for_type_inference(self._dype) - out = self._helper.create_variable_for_type_inference(self._dype) - softmax_out = self._helper.create_variable_for_type_inference( - self._dtype) - reduce_out = self._helper.create_variable_for_type_inference( - self._dtype) + tmp_i2h = self.create_variable(dtype=self._dtype) + tmp_h2h = self.create_variable(dtype=self._dtype) + hidden = self.create_variable(dtype=self._dtype) + out = self.create_variable(dtype=self._dtype) + softmax_out = self.create_variable(dtype=self._dtype) + reduce_out = self.create_variable(dtype=self._dtype) self._helper.append_op( type="mul", 
inputs={"X": input, @@ -132,7 +132,7 @@ class SimpleRNNCell(fluid.imperative.Layer): outputs={'Out': hidden}, attrs={'axis': -1, 'use_mkldnn': False}) - hidden = self._helper.append_activation(hidden) + hidden = self._helper.append_activation(hidden, act='tanh') self._helper.append_op( type="mul", @@ -174,7 +174,7 @@ class SimpleRNN(fluid.imperative.Layer): outs = list() pre_hiddens = list() - init_hidden = fluid.layers.tensor.create_parameter( + init_hidden = self.create_parameter( attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1)), shape=[1, 3], @@ -337,10 +337,10 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_grad, static_grad)) params = mlp.parameters(True) - self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name) - self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name) - self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name) - self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name) + self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name) + self.assertEqual("mlp/MLP_0/FC_0.b_0", params[1].name) + self.assertEqual("mlp/MLP_0/FC_1.w_0", params[2].name) + self.assertEqual("mlp/MLP_0/FC_1.b_0", params[3].name) self.assertEqual(len(params), 4) sublayers = mlp.sublayers(True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 7afbf61472a3d09ba5e34731d3a3ebbb8076e310..5b3c250501386a7854313218f5ea338281824252 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -78,7 +78,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): class MNIST(fluid.imperative.Layer): - def __init__(self, name_scope, param_attr=None, bias_attr=None): + def __init__(self, name_scope): super(MNIST, self).__init__(name_scope) self._simple_img_conv_pool_1 = SimpleImgConvPool( diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 878c27d9344111d18e1ff27a1d4f41f8ae0df4b0..3b602303ae9a183c7b66f5613321f58898fdfcc2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -41,19 +41,17 @@ class SimpleLSTMRNN(fluid.imperative.Layer): self._dropout = dropout self._input = None self._num_steps = num_steps - from paddle.fluid.layer_helper import LayerHelper - self._helper = LayerHelper('SimpleLSTMRNN', act="tanh") + self.cell_array = [] + self.hidden_array = [] def _build_once(self, input_embedding, init_hidden=None, init_cell=None): self.weight_1_arr = [] self.weight_2_arr = [] self.bias_arr = [] - self.hidden_array = [] - self.cell_array = [] self.mask_array = [] for i in range(self._num_layers): - weight_1 = self._helper.create_parameter( + weight_1 = self.create_parameter( attr=fluid.ParamAttr( initializer=fluid.initializer.UniformInitializer( low=-self._init_scale, high=self._init_scale)), @@ -62,7 +60,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer): default_initializer=fluid.initializer.UniformInitializer( low=-self._init_scale, high=self._init_scale)) self.weight_1_arr.append(weight_1) - bias_1 = self._helper.create_parameter( + bias_1 = self.create_parameter( attr=fluid.ParamAttr( initializer=fluid.initializer.UniformInitializer( low=-self._init_scale, high=self._init_scale)), @@ -71,6 +69,11 @@ class SimpleLSTMRNN(fluid.imperative.Layer): default_initializer=fluid.initializer.Constant(0.0)) self.bias_arr.append(bias_1) + def forward(self, input_embedding, init_hidden=None, init_cell=None): + self.cell_array = [] + self.hidden_array = [] + + for i in range(self._num_layers): pre_hidden = fluid.layers.slice( init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = fluid.layers.slice( @@ -82,7 +85,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer): 
self.hidden_array.append(pre_hidden) self.cell_array.append(pre_cell) - def forward(self, input_embedding, init_hidden=None, init_cell=None): res = [] for index in range(self._num_steps): self._input = fluid.layers.slice( @@ -145,8 +147,6 @@ class PtbModel(fluid.imperative.Layer): self.num_layers = num_layers self.num_steps = num_steps self.dropout = dropout - from paddle.fluid.layer_helper import LayerHelper - self._helper = LayerHelper('PtbModel', act="tanh") self.simple_lstm_rnn = SimpleLSTMRNN( self.full_name(), hidden_size, @@ -163,13 +163,13 @@ class PtbModel(fluid.imperative.Layer): name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale))) - self.softmax_weight = self._helper.create_parameter( + self.softmax_weight = self.create_parameter( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", default_initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale)) - self.softmax_bias = self._helper.create_parameter( + self.softmax_bias = self.create_parameter( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", @@ -180,7 +180,6 @@ class PtbModel(fluid.imperative.Layer): pass def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size])