Commit 2c4fcaa6 authored by: S sneaxiy

merge develop

......@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in the IPluginFactory class of NvInfer.h, otherwise it will not work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
......@@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword
paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
......@@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
......@@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
......
......@@ -20,6 +20,9 @@
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) {
// validate that the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
node->Name() == kEmptyVarName)
return false;
bool flag = true;
// Op outputs forced to be generated on the CPU cannot be reused.
......@@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) {
if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
std::string name = node.Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
return true;
}
......
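For context, the two hunks above consolidate the special-name check for memory reuse: the ir::Node overload now rejects kEmptyVarName directly, while the old '@...@' name test is removed from the VarDesc overload. A standalone sketch of that name predicate (the helper name is illustrative, and kEmptyVarName is assumed to be "@EMPTY@" as in framework/operator.h):

#include <string>

// Stand-in for paddle::framework::kEmptyVarName (assumed to be "@EMPTY@").
static const char kEmptyVarNameSketch[] = "@EMPTY@";

// Illustrative helper: true for names such as @EMPTY@ or @LR_DECAY_REUSE_ID@
// that must never be picked up by the memory-reuse pass.
bool IsReuseForbiddenName(const std::string& name) {
  if (name == kEmptyVarNameSketch) return true;
  return !name.empty() && name.front() == '@' && name.back() == '@';
}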
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
......@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars);
auto &stale_ops =
graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
g->Erase(details::kStaleProgramOpDescs);
g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(stale_ops));
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
......
......@@ -14,7 +14,6 @@
#pragma once
#include <array>
#include <deque>
#include "paddle/fluid/platform/enforce.h"
......@@ -22,7 +21,7 @@ namespace paddle {
namespace framework {
template <typename T, size_t N>
class SmallStack {
class InlinedStack {
static_assert(N > 0, "N must be larger than 0");
public:
......@@ -66,8 +65,8 @@ class SmallStack {
private:
T head_[N];
size_t size_{0};
std::deque<T> tail_;
size_t size_;
};
} // namespace framework
......
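The hunk above renames SmallStack to InlinedStack: the first N elements live in the inline head_ array and any overflow spills into the std::deque tail_. A self-contained sketch of that storage scheme, independent of the Paddle class (names are illustrative):

#include <cstddef>
#include <deque>

template <typename T, size_t N>
class InlinedStackSketch {
  static_assert(N > 0, "N must be larger than 0");

 public:
  void push(const T& v) {
    if (size_ < N) {
      head_[size_] = v;    // still fits in the inline buffer
    } else {
      tail_.push_back(v);  // spill to the heap-backed deque
    }
    ++size_;
  }

  void pop() {
    if (size_ > N) tail_.pop_back();
    --size_;
  }

  const T& top() const {
    return size_ <= N ? head_[size_ - 1] : tail_.back();
  }

  size_t size() const { return size_; }

 private:
  T head_[N];
  std::deque<T> tail_;
  size_t size_{0};
};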
......@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
......@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
// When we use TRT or another third-party lib, the parameters are managed by
// that lib rather than by fluid, so we need to record them to avoid duplicate
// allocation.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done
......
......@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
std::vector<ir::Node *> nodes;
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
nodes.push_back(adj_n);
}
}
std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
return node1->id() > node2->id();
});
adj_list[n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
......
......@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
const Variable* ExecutionContext::LegacyInputVar(
const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* ExecutionContext::OutputVar(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) return nullptr;
......@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
return Input<LoDTensor>(name);
}
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const {
return LegacyInput<LoDTensor>(name);
}
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
......@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
return res;
}
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const {
auto names = op().Inputs(name);
std::vector<const Tensor*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const Tensor* {
auto var = scope_.FindVar(sub_name);
if (var == nullptr) return nullptr;
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
}
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return Output<LoDTensor>(name);
}
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
return LegacyOutput<LoDTensor>(name);
}
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const {
......
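With the Legacy* accessors deleted above, kernels read their variables only through the name-indexed ExecutionContext lookups (Input/Output/MultiInput/MultiOutput), which resolve against ctx_.inputs/outputs rather than going back to the scope. A hedged sketch of the usual kernel pattern on top of that interface (the op and slot names are illustrative):

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace operators {

template <typename T>
class CopyLikeKernelSketch : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Name-based lookup; returns nullptr when the slot is not bound.
    auto* x = ctx.Input<framework::LoDTensor>("X");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    out->Resize(x->dims());
    out->mutable_data<T>(ctx.GetPlace());
    // ... the element-wise body of the kernel would go here ...
  }
};

}  // namespace operators
}  // namespace paddle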
......@@ -16,9 +16,11 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h" // For VLOG
......@@ -253,31 +255,6 @@ class ExecutionContext {
return it->second;
}
const std::vector<Variable*> LegacyMultiInputVar(
const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
template <typename T>
const T* Input(const std::string& name) const {
auto* var = InputVar(name);
......@@ -290,22 +267,6 @@ class ExecutionContext {
return var == nullptr ? nullptr : var->GetMutable<T>();
}
template <typename T>
const T* LegacyInput(const std::string& name) const {
auto* var = LegacyInputVar(name);
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* LegacyOutput(const std::string& name) const {
auto var = LegacyOutputVar(name);
return var == nullptr ? nullptr : var->GetMutable<T>();
}
const Variable* LegacyInputVar(const std::string& name) const;
Variable* LegacyOutputVar(const std::string& name) const;
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
auto it = ctx_.inputs.find(name);
......@@ -338,32 +299,6 @@ class ExecutionContext {
return res;
}
template <typename T>
const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<const T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : &var->Get<T>();
});
return res;
}
template <typename T>
std::vector<T*> LegacyMultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : var->GetMutable<T>();
});
return res;
}
platform::Place GetPlace() const { return device_context_.GetPlace(); }
template <typename DeviceContextType>
......@@ -433,24 +368,13 @@ class ExecutionContext {
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const;
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const;
......
......@@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -133,6 +137,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
......
......@@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
......@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
return "";
}
static std::string GetTrtEngineSerializedPath(const std::string &model_root,
const std::string &engine_key) {
return model_root + "/trt_serialized_" + engine_key;
}
static std::string GetTrtEngineSerializedData(
const std::string &model_opt_cache_dir, const std::string &engine_key) {
std::string trt_serialized_path =
GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
if (FileExists(trt_serialized_path)) {
VLOG(3) << "Trt serialized file: " << trt_serialized_path
<< "is found here";
std::ifstream infile(trt_serialized_path, std::ios::in);
std::stringstream buffer;
buffer << infile.rdbuf();
std::string trt_engine_serialized_data(buffer.str());
return trt_engine_serialized_data;
}
return "";
}
static void SaveTrtEngineSerializedDataToFile(
const std::string &trt_serialized_path,
const std::string &engine_serialized_data) {
std::ofstream outfile(trt_serialized_path);
outfile << engine_serialized_data;
outfile.close();
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
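The helpers added above implement a plain file cache for serialized TensorRT engines, keyed by engine_key under model_opt_cache_dir; GetTrtEngineSerializedData returns an empty string on a cache miss. A hedged sketch of the intended round trip (directory, key and payload are illustrative):

#include <string>

#include "paddle/fluid/inference/analysis/helper.h"

std::string LoadOrStoreEngineSketch(const std::string& cache_dir,
                                    const std::string& engine_key,
                                    const std::string& serialized) {
  namespace analysis = paddle::inference::analysis;
  std::string cached =
      analysis::GetTrtEngineSerializedData(cache_dir, engine_key);
  if (cached.empty()) {
    // First run: nothing cached yet, persist the freshly built engine.
    analysis::SaveTrtEngineSerializedDataToFile(
        analysis::GetTrtEngineSerializedPath(cache_dir, engine_key),
        serialized);
    return serialized;
  }
  // Later runs deserialize the cached bytes instead of rebuilding the engine.
  return cached;
}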
......@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine",
new bool(argument->tensorrt_use_static_engine()));
}
pre_pass = pass_name;
......
......@@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
......
......@@ -14,13 +14,13 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
......@@ -33,8 +33,15 @@ using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
......@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// Those parameters already exist in TRT and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph.get());
CreateTensorRTOp(node, graph.get(), graph_param_names,
&repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
......@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs) {
const std::set<std::string> &engine_outputs,
const std::string &predictor_id) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
......@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += predictor_id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
void TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
......@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// The node->inputs contains input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
......@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
......@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// input of an OP but also the output of an OP, there will be problems.
// So we have to rename the variables in the subgraph to make sure
// each one is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph();
for (size_t index = 0; index < block_desc.OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map);
// When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
......@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
......@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
// Set attrs
op_desc->SetType("tensorrt_engine");
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8");
auto engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id);
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0));
// Get "" when there is no cached calibration table data.
std::string calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
}
bool use_static_engine = Get<bool>("use_static_engine");
// When in int8 mode and calibration mode, the program just produces the
// calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine) {
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
if (trt_engine_serialized_data.empty()) {
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
} else {
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
......@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed") {
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
......@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map) {
// In the normal case, paddle-trt has a bug when running GoogLeNet:
// when there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt performs a merging optimization that fuses those convs
// into one conv and then triggers the bug. So we use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
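The RenameAndGetOutputs routine above disambiguates variables inside the fused subgraph by appending each graph node's id to the variable name, while engine inputs keep their original names; output_name_map records original → renamed pairs so the tensorrt_engine op can copy results back afterwards. A small standalone illustration of that naming scheme (plain C++; the node id and names are made up):

#include <string>
#include <unordered_map>

// Illustrative only: mirrors the "name + node id" suffixing used in
// RenameAndGetOutputs. A subgraph-internal tensor "conv_out" produced by
// graph node 42 becomes "conv_out42"; engine inputs keep their names.
std::string RenamedArg(const std::string& arg, int node_id,
                       bool is_engine_input) {
  return is_engine_input ? arg : arg + std::to_string(node_id);
}

void RenameExampleSketch() {
  std::unordered_map<std::string, std::string> output_name_map;
  // "conv_out" is an engine output produced by node 42.
  output_name_map["conv_out"] = RenamedArg("conv_out", 42, false);
  // output_name_map["conv_out"] == "conv_out42"; the tensorrt_engine op
  // later uses this mapping to copy the renamed ITensor back to "conv_out".
}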
......@@ -13,7 +13,12 @@
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
......@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph) const;
void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};
......
......@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
......@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there are cases where new parameter variables are not added to
// the program during the analysis pass.
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
......
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
CP_MEMBER(trt_use_static_engine_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
......@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode) {
AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
......@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
Update();
#else
......
......@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
}
VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
......@@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
}
if (config_.use_mkldnn_) {
......@@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
}
fetches_[idx] = op;
idx2fetches_[idx] = op->Input("X")[0];
}
}
}
......@@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
var->GetMutable<framework::FeedFetchList>();
}
std::vector<std::string> AnalysisPredictor::GetInputNames() {
std::vector<std::string> input_names;
for (auto &item : idx2feeds_) {
input_names.push_back(item.second);
}
return input_names;
}
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
std::vector<std::string> output_names;
for (auto &item : idx2fetches_) {
output_names.push_back(item.second);
}
return output_names;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
......@@ -460,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......@@ -470,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......
......@@ -15,12 +15,14 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
......@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
......@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
// Sorted according to the idx.
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leaks, so we cache them.
std::vector<framework::LoDTensor> feed_tensors_;
......
......@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
}
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res;
}
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GE(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before copy data from cpu.");
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size);
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::CUDAPlace gpu_place(device_);
auto *t_data = tensor->mutable_data<T>(gpu_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();
if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
......@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
return framework::vectorize2int(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
......
......@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
......
......@@ -50,6 +50,11 @@ class Timer {
}
};
static int GetUniqueId() {
static int id = 0;
return id++;
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear();
......
......@@ -135,7 +135,8 @@ struct AnalysisConfig {
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32);
Precision precision = Precision::kFloat32,
bool use_static = true);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
......@@ -233,6 +234,7 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
Precision tensorrt_precision_mode_;
bool trt_use_static_engine_;
// memory reuse related.
bool enable_memory_optim_{false};
......
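The new use_static parameter of EnableTensorRtEngine (stored in trt_use_static_engine_ above) is what allows the subgraph pass to reuse a serialized engine cached under model_opt_cache_dir. A hedged configuration sketch; SetModel and EnableUseGpu are assumed from the existing AnalysisConfig API, and only EnableTensorRtEngine's new argument comes from this diff:

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::AnalysisConfig MakeTrtConfigSketch(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);   // assumed existing setter
  config.EnableUseGpu(100, 0);  // assumed existing setter: 100 MB pool, GPU 0
  // workspace_size, max_batch_size, min_subgraph_size, precision, use_static
  config.EnableTensorRtEngine(1 << 20, 1, 3,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              /*use_static=*/true);
  return config;
}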
......@@ -160,11 +160,21 @@ class ZeroCopyTensor {
template <typename T>
T* data(PaddlePlace* place, int* size) const;
std::vector<int64_t> shape() const;
template <typename T>
void copy_from_cpu(const T* data);
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; }
void SetPlace(PaddlePlace place, int device = -1) {
place_ = place;
device_ = device;
}
protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
......@@ -179,6 +189,8 @@ class ZeroCopyTensor {
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
mutable void* tensor_{nullptr};
PaddlePlace place_;
int device_;
};
/** A simple Inference API for Paddle.
......@@ -200,6 +212,14 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/** \brief Get input names of the model
*/
virtual std::vector<std::string> GetInputNames() { return {}; }
/** \brief Get output names of the model
*/
virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
......
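Together, the additions above (GetInputNames/GetOutputNames on the predictor, copy_from_cpu/copy_to_cpu and the int-based shape() on ZeroCopyTensor) support a zero-copy inference loop roughly like the following. This is a hedged sketch: CreatePaddlePredictor, Reshape and ZeroCopyRun are assumed from the existing API, the input shape is illustrative, and additional config switches may be needed to run without feed/fetch ops:

#include <functional>
#include <numeric>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ZeroCopyRunSketch(const paddle::AnalysisConfig& config,
                       const std::vector<float>& input_data) {
  auto predictor = paddle::CreatePaddlePredictor(config);  // assumed factory

  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputTensor(input_names[0]);
  input->Reshape({1, 3, 224, 224});         // illustrative NCHW shape
  input->copy_from_cpu(input_data.data());  // host -> predictor place

  predictor->ZeroCopyRun();                 // assumed existing run method

  auto output_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> shape = output->shape();  // now returns std::vector<int>
  int numel = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
  std::vector<float> result(numel);
  output->copy_to_cpu(result.data());        // predictor place -> host
}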
......@@ -49,11 +49,6 @@ class EngineBase {
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;
// Return the IO buffer that is allocated in the engine. One can read/write directly
// on the buffer. If the buffer's buffer is nullptr, one can also allocate
// memory and maintain it outside the engine.
virtual Buffer& buffer(const std::string& name) = 0;
virtual ~EngineBase() {}
}; // class EngineBase
......
......@@ -18,21 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine->itensor_quote_num[input_name] += 1;
return false;
}
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
......@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
......@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
if (test_mode) {
engine->DeclareOutput(output_name);
}
}
......
......@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
if (CheckDims(dims_x, dims_y)) {
// The two input tensors should have the same dims
VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
......@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer";
plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
plugin->AddInput(X);
plugin->AddInput(Y);
nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
......
......@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp->data<float>()),
Y_t->memory_size() / sizeof(float));
static_cast<size_t>(Y_t->numel()));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims;
......
......@@ -16,9 +16,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
......@@ -26,6 +29,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
if (shape.size() == 4UL)
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
} // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
......@@ -110,6 +144,34 @@ class OpConverter {
}
}
// The scope here should be initialized with the parameter vars.
void ConvertBlockToTRTEngine(
framework::BlockDesc* block_desc, const framework::Scope& scope,
const std::vector<std::string>& inputs,
const std::unordered_set<std::string>& parameters,
const std::vector<std::string>& outputs, TensorRTEngine* engine) {
engine->InitNetwork();
for (auto& input : inputs) {
if (parameters.count(input)) continue;
auto* var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto var_shape = var->GetShape();
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(var_shape));
}
framework::proto::BlockDesc* block_proto = block_desc->Proto();
ConvertBlock(*block_proto, parameters, scope, engine);
for (auto& output : outputs) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {}
......
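Vec2TRT_Dims above drops the batch dimension when declaring engine inputs: a 4-D fluid shape [N, C, H, W] becomes DimsCHW(C, H, W) and a 2-D [N, C] becomes DimsCHW(C, 1, 1), because TensorRT dimensions exclude the batch. A plain-C++ illustration of just that mapping (no TensorRT types involved):

#include <array>
#include <cstdint>
#include <vector>

// Stand-in for nvinfer1::DimsCHW: only the three non-batch extents.
std::array<int64_t, 3> ToCHWSketch(const std::vector<int64_t>& shape) {
  // Mirrors Vec2TRT_Dims: only 2-D [N, C] and 4-D [N, C, H, W] are accepted.
  if (shape.size() == 4) return {shape[1], shape[2], shape[3]};
  return {shape[1], 1, 1};  // 2-D case: channels only
}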
......@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
alpha_tensor_temp->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
plugin::PReluPlugin* plugin =
new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// Keep the alpha tensor alive to avoid releasing its memory.
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::move(alpha_tensor_temp);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
......
......@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -79,7 +81,8 @@ class TRTConvertValidation {
if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
engine_.reset(
new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
engine_->InitNetwork();
}
......@@ -114,13 +117,12 @@ class TRTConvertValidation {
}
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
RandomizeTensor(x_tensor, place_, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
......@@ -146,19 +148,6 @@ class TRTConvertValidation {
// Declare outputs.
op_desc_.reset(new framework::OpDesc(desc, nullptr));
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromGPU(
input, static_cast<void*>(tensor->data<void>()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
}
// We use the set 'neglected_output' here, because some Ops like batch norm,
......@@ -168,43 +157,71 @@ class TRTConvertValidation {
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
op_->Run(scope_, place);
// Execute TRT.
engine_->Execute(batch_size);
cudaStreamSynchronize(engine_->stream());
platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_);
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 3000;
std::vector<std::string> input_output_names;
// Note: we need to filter out the parameters
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
input_output_names.push_back(input);
}
// Collect the fluid outputs.
std::vector<std::vector<float>> fluid_outs;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
input_output_names.push_back(output);
std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
cudaStreamSynchronize(engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outs.push_back(fluid_out);
}
// Bind input and output for TRT.
const int num_bindings = input_output_names.size();
std::vector<void*> buffers(num_bindings);
for (const std::string& name : input_output_names) {
auto* var = scope_.FindVar(name);
auto* tensor = var->GetMutable<framework::LoDTensor>();
const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
buffers[bind_index] =
static_cast<void*>(tensor->mutable_data<float>(place_));
}
// Execute TRT.
engine_->Execute(batch_size, &buffers, stream_);
size_t fluid_out_size = fluid_out.size();
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
int index = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> trt_out;
auto* var = scope_.FindVar(output);
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &trt_out);
size_t fluid_out_size = fluid_outs[index].size();
if (if_add_batch_ == true) {
fluid_out_size =
batch_size * (framework::product(tensor->dims()) / max_batch_size_);
}
// Compare the two outputs
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out_size; i++) {
// Loosen the threshold for CI on different machine models.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
}
index += 1;
}
}
framework::Scope& scope() { return scope_; }
private:
platform::CUDAPlace place_;
std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_;
......
......@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented");
}
void TensorRTEngine::Execute(int batch_size) {
void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId();
batch_size_ = batch_size;
std::vector<void *> buffers;
for (auto &buf : buffers_) {
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
cudaStreamSynchronize(stream_);
infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
cudaStreamSynchronize(stream);
SetRuntimeBatch(batch_size);
}
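With this change the engine no longer owns per-binding device buffers or a CUDA stream; callers gather the binding pointers and supply their own stream. A minimal calling sketch (not part of this commit), assuming a frozen engine and two hypothetical bindings "x" and "y" whose device memory is already allocated:
std::vector<void *> buffers(2);
buffers[engine->engine()->getBindingIndex("x")] = x_gpu_ptr;  // input binding (assumed pointer)
buffers[engine->engine()->getBindingIndex("y")] = y_gpu_ptr;  // output binding (assumed pointer)
engine->Execute(batch_size, &buffers, stream);  // stream is created and owned by the caller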
TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(stream_);
// clean buffer
for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
buf.max_size = 0;
}
}
}
void TensorRTEngine::FreezeNetwork() {
VLOG(3) << "TRT to freeze network";
freshDeviceId();
VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr,
"Call InitNetwork first to initialize network.");
PADDLE_ENFORCE(infer_network_ != nullptr,
......@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
infer_context_.reset(infer_engine_->createExecutionContext());
// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size());
for (auto &item : buffer_sizes_) {
// The output buffers are not set in the network building phase; they need to
// be inferred from the TensorRT network.
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0);
}
auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1G
buf.device = DeviceType::GPU;
}
}
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
......@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
buffer_sizes_[name] = 0;
}
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
return buffer(name).buffer;
}
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, stream_));
}
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset];
}
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, stream_));
}
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
buf.size = size;
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyDeviceToDevice, stream_));
}
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
......@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_, count);
cudaSetDevice(device_);
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, int num_inputs,
plugin::PluginTensorRT *plugin) {
......@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
}
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count);
cudaSetDevice(device_id_);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
......@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
* There are two alternative ways to use it: one is to build from a paddle
* protobuf model, the other is to manually construct the network.
*/
class TensorRTEngine : public EngineBase {
class TensorRTEngine {
using DescType = ::paddle::framework::proto::BlockDesc;
public:
// Weight is model parameter.
class Weight {
......@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
nvinfer1::Weights w_;
};
TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
int device = 0, bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr,
TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch),
max_workspace_(max_workspace),
stream_(stream),
device_(device),
enable_int8_(enable_int8),
calibrator_(calibrator),
device_id_(device_id),
logger_(logger) {}
virtual ~TensorRTEngine();
~TensorRTEngine() {}
// TODO(Superjomn) implement it later when graph segmentation is supported.
void Build(const DescType& paddle_model) override;
void Build(const DescType& paddle_model);
void Execute(int batch_size) override;
void Execute(int batch_size, std::vector<void*>* buffers,
cudaStream_t stream);
// Initialize the inference network, so that TensorRT layers can add to this
// network.
void InitNetwork() {
freshDeviceId();
infer_builder_.reset(createInferBuilder(&logger_));
infer_network_.reset(infer_builder_->createNetwork());
}
......@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
// Check if the ITensor has been declared
bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with a specific name. One can operate on
// this memory directly for acceleration, for example, write the converted
// data directly into the buffer to save the data-copy overhead.
// NOTE this should be used after calling `FreezeNetwork`.
Buffer& buffer(const std::string& name) override;
cudaStream_t stream() { return stream_; }
// Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, const void* data, size_t size);
// TODO(Superjomn) is this method necessary given that buffer(xxx) can be
// accessed directly. Fill an input from GPU memory with name and size.
void SetInputFromGPU(const std::string& name, const void* data, size_t size);
// Get an output called name; the output of tensorrt is on the GPU, so this
// method returns the output's GPU memory address without a copy.
void* GetOutputInGPU(const std::string& name);
// Copy data into dst inside the GPU device.
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
// LOW EFFICIENCY! Get the output to the CPU; this will trigger a memory copy from GPU
// to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
// Fill an ITensor into map itensor_map_.
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name.
nvinfer1::ITensor* GetITensor(const std::string& name);
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr,
"You should build engine first and then serialize");
ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get();
}
void Deserialize(const std::string& engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
PADDLE_ENFORCE(infer_engine_ != nullptr,
"build cuda engine failed when deserialize engine info.!");
infer_context_.reset(infer_engine_->createExecutionContext());
}
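A hedged sketch of how Serialize and Deserialize pair up; everything except the engine API shown above is an assumption, and the constructor follows the new argument order (max_batch, max_workspace, enable_int8, calibrator, device_id):
nvinfer1::IHostMemory *plan = src_engine.Serialize();  // src_engine must be built and frozen first
std::string blob(static_cast<const char *>(plan->data()), plan->size());
TensorRTEngine dst_engine(max_batch, max_workspace, false, nullptr, 0);
dst_engine.Deserialize(blob);  // re-creates registered plugins via PluginFactoryTensorRT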
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
int GetDevice() { return device_; }
int GetDeviceId() { return device_id_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, plugin::PluginTensorRT*);
......@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
// TODO(NHZLX)
// In the normal case, paddle-trt has a bug when running googlenet.
// When there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt performs a merging optimization that fuses those convolutions
// into one, which then triggers the bug. So we should use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
// the max batch size
int max_batch_;
// the runtime batch size
......@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
// the max memory size the engine uses
int max_workspace_;
cudaStream_t stream_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
bool enable_int8_;
TRTInt8Calibrator* calibrator_;
// batch size of the current data; will be updated on each Execution.
int batch_size_{-1};
int device_id_;
nvinfer1::ILogger& logger_;
std::vector<Buffer> buffers_;
// max data size for the buffers.
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
......@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
infer_ptr<nvinfer1::IExecutionContext> infer_context_;
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
}; // class TensorRTEngine
// Add a layer__ into engine__ with args ARGS.
// For example:
// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
......
......@@ -17,6 +17,9 @@
#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
~NaiveLogger() override {}
};
class NaiveProfiler : public nvinfer1::IProfiler {
public:
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) {
auto record =
std::find_if(mProfile.begin(), mProfile.end(),
[&](const Record& r) { return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
void printLayerTimes() {
float totalTime = 0;
for (size_t i = 0; i < mProfile.size(); i++) {
printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
mProfile[i].second);
totalTime += mProfile[i].second;
}
printf("Time over all layers: %4.3f\n", totalTime);
}
};
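NaiveProfiler is only a debugging aid and is not wired up anywhere in this diff; a rough usage sketch, assuming access to the engine's nvinfer1::IExecutionContext:
NaiveProfiler profiler;
infer_context->setProfiler(&profiler);  // standard IExecutionContext hook
// ... run enqueue() a few times ...
profiler.printLayerTimes();  // prints per-layer milliseconds and the total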
} // namespace tensorrt
} // namespace inference
} // namespace paddle
nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
prelu_op_plugin.cu trt_plugin_factory.cc
avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine prelu)
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
......@@ -20,6 +21,12 @@ namespace inference {
namespace tensorrt {
namespace plugin {
AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
size_t length) {
return new AvgPoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
int index, const nvinfer1::Dims* inputDims, int nbInputs) {
assert(nbInputs == 1);
......
......@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
SerializedSize(strides_) + SerializedSize(paddings_) +
SerializedSize(input_shape_) + getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
SerializedSize(ksize_) + SerializedSize(strides_) +
SerializedSize(paddings_) + SerializedSize(input_shape_) +
SerializedSize(output_shape_) + getBaseSerializationSize();
}
// TRT will call this function when it needs to serialize the configuration
// of the plugin.
// It should not be called by users.
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, ceil_mode_);
SerializeValue(&buffer, ksize_);
SerializeValue(&buffer, strides_);
SerializeValue(&buffer, paddings_);
SerializeValue(&buffer, input_shape_);
SerializeValue(&buffer, output_shape_);
}
public:
AvgPoolPlugin() {}
AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
std::vector<int> input_shape)
......@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
DeserializeValue(&serialData, &serialLength, &strides_);
DeserializeValue(&serialData, &serialLength, &paddings_);
DeserializeValue(&serialData, &serialLength, &input_shape_);
DeserializeValue(&serialData, &serialLength, &output_shape_);
}
AvgPoolPlugin *clone() const override {
......@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
input_shape_);
}
const char *getPluginType() const override { return "avg_pool"; }
const char *getPluginType() const override { return "avg_pool_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
......
......@@ -14,12 +14,19 @@ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
size_t length) {
return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
namespace details {
template <typename T>
......@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
if (type_ == "add") {
details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
} else if (type_ == "mul") {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -24,9 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
int axis)
ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
nvinfer1::Dims const &dims_y, int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
......@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
const char *elementwise_type;
DeserializeValue(&serial_data, &serial_length, &elementwise_type);
type_ = std::string(elementwise_type);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
......@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
return nullptr;
}
const char *getPluginType() const override { return "elementwise"; }
const char *getPluginType() const override { return "elementwise_plugin"; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
......@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) +
SerializedSize(dims_y_) + getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(axis_) +
SerializedSize(dims_x_) + SerializedSize(dims_y_) +
getBaseSerializationSize();
}
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, type_.c_str());
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
}
nvinfer1::ElementWiseOperation type_;
std::string type_;
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
int axis_;
......
......@@ -17,6 +17,7 @@
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
......@@ -24,6 +25,17 @@ namespace inference {
namespace tensorrt {
namespace plugin {
PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
return new PReluPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
int PReluPlugin::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
// IPluginExt::initialize() should return 0 on success.
return 0;
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
......@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
// const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
const float *alpha = p_gpu_weight_;
float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape;
......
......@@ -14,7 +14,12 @@
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -24,39 +29,51 @@ namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::vector<float> weight_;
float *p_gpu_weight_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
SerializedSize(weight_) + SerializedSize(getPluginType());
}
// TRT will call this function when it needs to serialize the configuration
// of the plugin.
// It should not be called by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
}
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
PReluPlugin(const float *weight, const int weight_num,
std::string const &mode)
: mode_(mode) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It is used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char *prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}
~PReluPlugin() { cudaFree(p_gpu_weight_); }
int initialize() override;
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
PReluPlugin *clone() const override {
return new PReluPlugin(weight_.data(), weight_.size(), mode_);
}
const char *getPluginType() const override { return "prelu"; }
const char *getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
......
......@@ -15,12 +15,18 @@
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
return new SplitPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
// copied from operators::math::SplitFunctor
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
......
......@@ -15,6 +15,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -25,6 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
public:
SplitPlugin() {}
SplitPlugin(int axis, std::vector<int> const &output_lengths)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {}
......@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
return new SplitPlugin(axis_, output_length_);
}
const char *getPluginType() const override { return "split"; }
const char *getPluginType() const override { return "split_plugin"; }
int getNbOutputs() const override { return output_length_.size(); }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
......@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(output_length_) +
getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(axis_) +
SerializedSize(output_length_) + getBaseSerializationSize();
}
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_);
......
......@@ -17,9 +17,10 @@
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -30,6 +31,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {
class PluginTensorRT;
typedef std::function<PluginTensorRT*(const void*, size_t)>
PluginDeserializeFunc;
typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
class PluginTensorRT : public nvinfer1::IPluginExt {
public:
PluginTensorRT() {}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
const void* serial_data,
size_t serial_length) {
const char* plugin_type;
DeserializeValue(&serial_data, &serial_length, &plugin_type);
PADDLE_ENFORCE(Has(plugin_type),
"trt plugin type %s does not exists, check it.", plugin_type);
auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
owned_plugins_.emplace_back(plugin);
return plugin;
}
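createPlugin relies on the serialization convention this commit introduces across all plugins: serialize() writes getPluginType() as the very first field, and only the remaining bytes reach the registered deserialize function. A rough sketch of the blob layout (illustrative only):
// [plugin_type c-string][fields written by serializeBase()][plugin-specific fields]
// createPlugin pops plugin_type, then calls plugin_registry_[plugin_type](rest, rest_len).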
bool PluginFactoryTensorRT::RegisterPlugin(
const std::string& op_name, PluginDeserializeFunc deserialize_func) {
if (Has(op_name)) return false;
auto ret = plugin_registry_.emplace(op_name, deserialize_func);
return ret.second;
}
void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
public DeleteHelper {
public:
// Deserialization method
PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
size_t serial_length) override;
bool RegisterPlugin(const std::string& op_name,
PluginDeserializeFunc deserialize_func);
bool Has(const std::string& op_name) {
return plugin_registry_.find(op_name) != plugin_registry_.end();
}
void DestroyPlugins();
protected:
std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};
class TrtPluginRegistrar {
public:
TrtPluginRegistrar(const std::string& name,
PluginDeserializeFunc deserialize_func) {
inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
name, deserialize_func);
}
};
#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
trt_plugin_registrar##ctr __attribute__((unused)) = \
paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
name, deserialize_func)
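A minimal registration sketch for a hypothetical plugin; MyPlugin and "my_plugin" are assumptions, but the real plugins in this commit (e.g. "split_plugin") follow exactly this pattern:
MyPlugin *CreateMyPluginDeserialize(const void *buffer, size_t length) {
  return new MyPlugin(buffer, length);  // ctor must mirror what serialize() wrote after the type
}
REGISTER_TRT_PLUGIN("my_plugin", CreateMyPluginDeserialize);
// "my_plugin" must match MyPlugin::getPluginType(), which serialize() writes first.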
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
#include <cstring>
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
......@@ -24,6 +24,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {
// Some trt base classes lack a virtual destructor.
// We use a helper class to fix this.
struct DeleteHelper {
protected:
virtual ~DeleteHelper() {}
};
template <typename T>
inline void SerializeValue(void** buffer, T const& value);
......
......@@ -17,6 +17,8 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -27,19 +29,34 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(10, 1 << 10, stream_);
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
}
void TearDown() override {
delete engine_;
cudaStreamDestroy(stream_);
if (engine_) {
delete engine_;
engine_ = nullptr;
}
}
void PrepareInputOutput(const std::vector<float> &input,
std::vector<int> output_shape) {
TensorFromVector(input, *ctx_, &input_);
output_.Resize(framework::make_ddim(output_shape));
}
void GetOutput(std::vector<float> *output) {
TensorToVector(output_, *ctx_, output);
}
protected:
TensorRTEngine* engine_;
cudaStream_t stream_;
framework::Tensor input_;
framework::Tensor output_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
};
TEST_F(TensorRTEngineTest, add_layer) {
......@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
float raw_weight[size] = {2.}; // Weight in CPU memory.
float raw_bias[size] = {3.};
std::vector<void *> buffers(2);  // TRT bound inputs
LOG(INFO) << "create weights";
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
......@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
// fill in real data
float x_v = 1234;
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
1 * sizeof(float));
std::vector<float> x_v = {1234};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {1});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
LOG(INFO) << "to execute";
engine_->Execute(1);
engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
GetOutput(&y_cpu);
LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3);
ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
}
TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
......@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
// instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
float raw_bias[2] = {1.3, 2.4};
std::vector<void *> buffers(2);  // TRT bound inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 2, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
......@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[2] = {1.0, 2.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
2 * sizeof(float));
engine_->Execute(1);
// fill in real data
std::vector<float> x_v = {1.0, 2.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.};
GetOutput(&y_cpu);
auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5);
}
......@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
// Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0};
std::vector<void *> buffers(2);  // TRT bound inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 3, 3});
auto* conv_layer =
auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
......@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
18 * sizeof(float));
engine_->Execute(2);
// fill in real data
std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {18});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float* y_cpu = new float[18];
engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
GetOutput(&y_cpu);
ASSERT_EQ(y_cpu[0], 4.0);
ASSERT_EQ(y_cpu[1], 6.0);
}
TEST_F(TensorRTEngineTest, test_pool2d) {
// Weight in CPU memory.
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 2, 2});
std::vector<void *> buffers(2);  // TRT bound inputs
nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
auto* pool_layer =
TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
pool_t, nvinfer1::DimsHW{2, 2});
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr);
pool_layer->setStride(nvinfer1::DimsHW{1, 1});
......@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
8 * sizeof(float));
engine_->Execute(2);
// fill in real data
std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float* y_cpu = new float[2];
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
GetOutput(&y_cpu);
ASSERT_EQ(y_cpu[0], 2.0);
ASSERT_EQ(y_cpu[1], 5.0);
......
......@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
......
......@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
......
......@@ -17,8 +17,10 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include <vector>
#ifdef WITH_GPERFTOOLS
#include <gperftools/profiler.h>
......@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
auto main_predictor = CreateTestPredictor(config, use_analysis);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis));
for (int tid = 1; tid < num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
......@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<PaddleTensor> outputs_tid;
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = main_predictor->Clone();
auto &predictor = predictors[tid];
#ifdef PADDLE_WITH_MKLDNN
if (use_analysis) {
static_cast<AnalysisPredictor *>(predictor.get())
......
......@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
if (use_gpu) {
config->EnableUseGpu(100, 0);
if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size);
config->EnableTensorRtEngine(1 << 10, batch_size, 3,
AnalysisConfig::Precision::kFloat32, false);
config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug();
......
......@@ -26,20 +26,17 @@ Allocator::~Allocator() {}
bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
VLOG(2) << "Alloc allocation on " << typeid(*this).name();
auto ptr = AllocateImpl(size, attr);
ptr->RegisterAllocatorChain(this);
VLOG(2) << "Alloc success";
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
auto* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
VLOG(2) << "Free allocation on " << typeid(*this).name();
allocation->PopAllocator();
FreeImpl(allocation);
}
......@@ -47,7 +44,7 @@ void Allocator::Free(Allocation* allocation) {
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
auto* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopAllocator();
allocator->Free(allocation);
}
......
......@@ -16,7 +16,7 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/small_stack.h"
#include "paddle/fluid/framework/inlined_stack.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -82,7 +82,7 @@ class Allocation {
std::vector<Allocator*> GetAllocatorChain() const {
std::vector<Allocator*> allocators;
for (size_t i = 0; i < allocator_chain_.size(); ++i) {
allocators[i] = allocator_chain_[i];
allocators.push_back(allocator_chain_[i]);
}
return allocators;
}
......@@ -100,7 +100,7 @@ class Allocation {
void* ptr_;
size_t size_;
platform::Place place_;
framework::SmallStack<Allocator*, 8> allocator_chain_;
framework::InlinedStack<Allocator*, 8> allocator_chain_;
friend class Allocator;
friend class AllocationDeleter;
......
......@@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(initial_gpu_memory_in_mb);
DECLARE_double(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
......@@ -69,7 +71,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
platform::CpuMaxChunkSize());
});
return a;
......@@ -131,40 +134,53 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() {
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
allocation::GPUMemMonitor.Initialize(devices.size());
class GPUBuddyAllocatorList {
public:
GPUBuddyAllocatorList()
: allocators_(platform::GetCUDADeviceCount()),
flags_(platform::GetCUDADeviceCount()) {
allocation::GPUMemMonitor.Initialize(allocators_.size());
}
a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
BuddyAllocator *Get(size_t dev_id) {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
platform::SetDeviceId(dev_id);
a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
size_t first_size = platform::GpuFirstAllocateChunkSize();
size_t re_size = platform::GpuReAllocateChunkSize();
allocators_[dev_id] =
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), first_size, re_size);
VLOG(2) << "\n\nNOTE: each GPU device use "
<< string::HumanReadableSize(first_size) << "(initial chunk) "
<< string::HumanReadableSize(re_size) << "(reallocate chunk) "
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' or "
"'FLAGS_initial_gpu_memory_in_mb/"
"FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
"of GPU usage.\n\n";
VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
<< FLAGS_fraction_of_gpu_memory_to_use << ", "
<< "FLAGS_initial_gpu_memory_in_mb="
<< FLAGS_initial_gpu_memory_in_mb << ", "
<< "FLAGS_reallocate_gpu_memory_in_mb="
<< FLAGS_reallocate_gpu_memory_in_mb;
});
return allocators_[dev_id];
}
private:
std::vector<BuddyAllocator *> allocators_;
std::vector<std::once_flag> flags_;
};
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static GPUBuddyAllocatorList allocators;
platform::SetDeviceId(gpu_id);
auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
return allocators.Get(gpu_id);
}
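With the list wrapper, each device's BuddyAllocator is created lazily on first use via std::call_once instead of eagerly for every selected device. A rough usage sketch using only the interfaces shown in this diff; the size and device id are arbitrary:
auto *allocator = GetGPUBuddyAllocator(/*gpu_id=*/0);  // first call builds the allocator for GPU 0
void *p = allocator->Alloc(1 << 20);                   // 1 MiB served from the initial chunk
allocator->Free(p);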
#endif
......@@ -183,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
if (ptr == nullptr && size > 0) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
......@@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
......
......@@ -14,16 +14,90 @@
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <limits>
#include <sstream>
#include <string>
#include "paddle/fluid/platform/lock_guard_ptr.h"
DEFINE_double(tolerant_times, 2,
"Tolerant memory size times of buffered_allocator");
DEFINE_string(division_plan_path, "", "Division plan file path");
namespace paddle {
namespace memory {
namespace allocation {
std::string TrimStringAndToLowerCase(const std::string &str) {
auto not_space = [](char ch) { return std::isspace(ch) == 0; };
auto first_idx = static_cast<size_t>(
std::find_if(str.begin(), str.end(), not_space) - str.begin());
auto last_idx = static_cast<size_t>(
std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
if (first_idx == str.size() || last_idx == str.size()) return "";
last_idx = str.size() - 1 - last_idx;
auto ret = str.substr(first_idx, last_idx - first_idx);
std::for_each(ret.begin(), ret.end(),
[](char &ch) { ch = std::tolower(ch); });
return ret;
}
static size_t ParseStringToBytes(const std::string &str) {
std::string ret = str;
if (ret.back() == 'b') {
ret.pop_back();
}
PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
size_t multiples = 1;
switch (ret.back()) {
case 'g':
multiples *= (static_cast<size_t>(1) << 30);
break;
case 'm':
multiples *= (static_cast<size_t>(1) << 20);
break;
case 'k':
multiples *= (static_cast<size_t>(1) << 10);
break;
default:
break;
}
if (multiples != 1) ret.pop_back();
ret = TrimStringAndToLowerCase(ret);
double ret_val = 0.0;
std::stringstream ss(ret);
PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
return static_cast<size_t>(ret_val * multiples);
}
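A few worked examples of what ParseStringToBytes accepts, assuming the line has already been trimmed and lower-cased (ReadDivisionPlanFromFile does this before calling it):
// "512"   -> 512 bytes (no suffix)
// "4k"    -> 4 * 2^10 = 4096 bytes
// "128mb" -> 128 * 2^20 bytes (trailing 'b' is stripped first, then 'm')
// "1.5g"  -> static_cast<size_t>(1.5 * 2^30) bytes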
static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
std::string ret("[");
for (auto sz : plan) {
ret += string::HumanReadableSize(sz);
ret += ", ";
}
return ret + "]";
}
static std::vector<size_t> ReadDivisionPlanFromFile(
const std::string &filepath) {
std::ifstream is(filepath.c_str());
PADDLE_ENFORCE(is.good(), "File does not exist");
std::string str;
std::vector<size_t> plan;
while (std::getline(is, str).good()) {
str = TrimStringAndToLowerCase(str);
if (str.empty()) break;
plan.push_back(ParseStringToBytes(str));
}
return plan;
}
static void CheckAndModifyMemoryDivisionPlan(
std::vector<size_t> *division_plan) {
// Check whether the division plan is strictly sorted
......@@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan(
}
static std::vector<size_t> GetDefaultDivisionPlan() {
if (!FLAGS_division_plan_path.empty()) {
return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
}
constexpr size_t kMaxLogSize = 30;
std::vector<size_t> plan;
for (size_t i = 12; i <= kMaxLogSize; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
/*
for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
*/
return plan;
}
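As a concrete illustration: with kMaxLogSize = 30 and FLAGS_division_plan_path left empty, the default plan is the 19 power-of-two sizes from 2^12 to 2^30:
// {4 KiB, 8 KiB, 16 KiB, ..., 512 MiB, 1 GiB}   // i = 12 .. 30 inclusive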
......@@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
: underlying_allocator_(std::move(underlying_allocator)),
division_plan_(division_plan) {
CheckAndModifyMemoryDivisionPlan(&division_plan_);
allocations_.resize(division_plan_.size());
mtx_.resize(division_plan_.size());
allocations_.resize(division_plan_.size() - 1);
mtx_.resize(division_plan_.size() - 1);
if (underlying_allocator_->IsAllocThreadSafe()) {
for (auto &mtx : mtx_) {
mtx.reset(new std::mutex());
}
}
VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
}
void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
{
if (bin_index < allocations_.size()) {
platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
allocations_[bin_index].emplace(allocation->size(),
AllocationPtr(allocation));
} else {
underlying_allocator_->Free(allocation);
}
}
void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
// bin_index is not used currently.
// Maybe we can design more flexible FreeCache strategy based on bin_index
size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
size_t accumulated_size = 0;
// FIXME(zjl): free the largest first when there is no extra
for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
......@@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
underlying_allocator_->Free(it->second.release());
allocations_[i].erase(it--);
if (accumulated_size >= size) {
return;
return accumulated_size;
}
} while (!allocations_[i].empty());
}
return accumulated_size;
}
Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
auto upper_size = TolerantUpperSize(size);
for (; upper_size >= division_plan_[bin_index]; ++bin_index) {
// if (bin_index >= allocations_.size()) {
// VLOG(2) << "Allocate " << size << " from underlying directly";
//}
for (; bin_index < allocations_.size() &&
upper_size >= division_plan_[bin_index];
++bin_index) {
auto &allocation = allocations_[bin_index];
platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
auto it = allocation.lower_bound(size);
if (it != allocation.end() && it->second->size() < upper_size) {
if (it != allocation.end() && it->second->size() <= upper_size) {
size_t sz = it->second->size();
auto ret = std::move(it->second);
allocation.erase(it);
VLOG(3) << "Allocate " << sz << "(required " << size
<< ") from cache directly";
return ret.release();
}
}
try {
return underlying_allocator_->Allocate(size, attr).release();
} catch (BadAlloc &) {
VLOG(2) << "BadAlloc raises, try to free " << size << " caches";
FreeCache(size, bin_index);
return underlying_allocator_->Allocate(size, attr).release();
size_t retry_time = 1;
while (true) {
try {
auto ret = underlying_allocator_->Allocate(size, attr).release();
VLOG(2) << "Allocate " << size << " from underlying directly";
return ret;
} catch (BadAlloc &) {
VLOG(1) << retry_time << "-th BadAlloc raised, try to free " << size
<< " bytes of caches";
// size_t actual_free_size = FreeCache(size, bin_index);
size_t actual_free_size = FreeCache(-1UL, bin_index);
VLOG(1) << retry_time << "-th retry freed " << actual_free_size
<< " bytes of caches";
if (actual_free_size == 0) throw;
}
++retry_time;
}
}
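FindDivisionPlanBinIndex and TolerantUpperSize are defined outside this hunk. Since allocations_ holds division_plan_.size() - 1 multimaps, bin i presumably caches allocations whose size falls in [division_plan_[i], division_plan_[i + 1]), with requests at or above the last boundary bypassing the cache. A standalone sketch of that assumed mapping (not the patch's actual helper):
#include <algorithm>
#include <cstddef>
#include <vector>
// Sketch of the assumed bin lookup; the real FindDivisionPlanBinIndex may differ.
static size_t BinIndexSketch(const std::vector<size_t> &plan, size_t size) {
  // Index of the last boundary <= size. Sizes below plan.front() map to bin 0;
  // sizes >= plan.back() yield plan.size() - 1, which is >= allocations_.size()
  // and therefore falls through to the underlying allocator, matching the
  // bin_index < allocations_.size() checks above.
  auto it = std::upper_bound(plan.begin(), plan.end(), size);
  return it == plan.begin() ? 0 : static_cast<size_t>(it - plan.begin()) - 1;
}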
......
......@@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator {
void FreeImpl(Allocation* allocation) override;
private:
void FreeCache(size_t size, size_t bin_index);
size_t FreeCache(size_t size, size_t bin_index);
std::shared_ptr<Allocator> underlying_allocator_;
std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
......
......@@ -25,9 +25,11 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size)
size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
first_allocate_chunk_size_(first_allocate_chunk_size),
reallocate_chunk_size_(reallocate_chunk_size),
max_chunk_size_(first_allocate_chunk_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
......@@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
auto desc = cache_.load(block);
VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, desc.index);
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
......@@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator.";
return SystemAlloc(size);
return SystemAlloc(size, false);
}
// query and allocate from the existing chunk
......@@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
// if still failure, fail fatally
// if still failure, try to allocate from SystemAllocator
if (it == pool_.end()) {
return nullptr;
return SystemAlloc(size, false);
}
} else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
......@@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) {
VLOG(10) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
......@@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) {
size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() {
std::lock_guard<std::mutex> lock(mutex_);
return max_chunk_size_;
}
void* BuddyAllocator::SystemAlloc(size_t size) {
void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
size_t index = 0;
void* p = system_allocator_->Alloc(&index, size);
......@@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr);
static_cast<MemoryBlock*>(p)->init(
&cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
: MemoryBlock::UNMANAGED_HUGE_CHUNK,
index, size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
if (total_used_ + total_free_ > 0) {
max_chunk_size_ = reallocate_chunk_size_;
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
size_t chunk_size = max_chunk_size_;
void* p = system_allocator_->Alloc(&index, chunk_size);
if (p == nullptr) return pool_.end();
......@@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
chunk_size, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
......@@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += max_chunk_size_;
total_free_ += chunk_size;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
......@@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_) return;
if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
// If no GPU fallback allocator, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
auto desc = cache_.load(block);
if (desc.index == 0) {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= desc.size;
fallback_alloc_count_--;
// If no fallback allocation exists, return directly
......@@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
return;
}
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, desc.index);
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= desc.size;
if (!shall_free_alloc()) return;
}
......
......@@ -34,7 +34,8 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size);
size_t min_chunk_size, size_t first_allocate_chunk_size,
size_t reallocate_chunk_size);
~BuddyAllocator();
......@@ -57,7 +58,7 @@ class BuddyAllocator {
using PoolSet = std::set<IndexSizeAddress>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size);
void* SystemAlloc(size_t size, bool is_managed = true);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
......@@ -87,7 +88,11 @@ class BuddyAllocator {
size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
size_t first_allocate_chunk_size_;
size_t reallocate_chunk_size_;
size_t max_chunk_size_;
private:
/**
......
......@@ -27,10 +27,11 @@ class MetadataCache;
// MemoryBlock::Desc and the payload.
struct MemoryBlock {
enum Type {
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
HUGE_CHUNK, // memory is out of management
INVALID_CHUNK // memory is invalid
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
MANAGED_HUGE_CHUNK,    // memory is huge and managed by the buddy allocator
UNMANAGED_HUGE_CHUNK,  // memory is huge and not managed by the buddy allocator
INVALID_CHUNK // memory is invalid
};
// init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
......
......@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
// Initialize the OpDesc
if (op_desc_info.Has(config_.op_type)) {
type_ = config_.op_type;
op_desc_.SetType(config_.op_type);
CreateOpDesc();
CreateInputVarDesc();
CreateOutputVarDesc();
} else {
......@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
return output_names;
}
std::unordered_map<std::string, framework::proto::AttrType>
OpTester::GetOpProtoAttrNames() {
std::unordered_map<std::string, framework::proto::AttrType> attr_types;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
const std::vector<std::string> skipped_attrs = {
framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
for (int i = 0; i != proto.attrs_size(); ++i) {
const auto &attr = proto.attrs(i);
if (!Has(skipped_attrs, attr.name())) {
VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
attr_types[attr.name()] = attr.type();
}
}
return attr_types;
}
framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
if (str == "int32") {
return framework::proto::VarType::INT32;
} else if (str == "int64") {
return framework::proto::VarType::INT64;
} else if (str == "fp32") {
return framework::proto::VarType::FP32;
} else if (str == "fp64") {
return framework::proto::VarType::FP64;
} else {
PADDLE_THROW("Unsupported dtype %s.", str.c_str());
}
}
void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) {
......@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
// Need to support more types
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
var->SetDataType(TransToVarType(input->dtype));
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
input_lods_[var_name] = input->lod;
inputs_[var_name] = *input;
}
}
......@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
}
}
void OpTester::CreateOpDesc() {
op_desc_.SetType(config_.op_type);
std::unordered_map<std::string, framework::proto::AttrType> attr_types =
GetOpProtoAttrNames();
for (auto item : config_.attrs) {
const std::string &name = item.first;
if (attr_types.find(name) == attr_types.end()) {
LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
}
const std::string &value_str = item.second;
const framework::proto::AttrType &type = attr_types[name];
switch (type) {
case framework::proto::AttrType::BOOLEAN:
break;
case framework::proto::AttrType::INT: {
int value = StringTo<int>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::FLOAT: {
float value = StringTo<float>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::STRING: {
op_desc_.SetAttr(name, {value_str});
} break;
case framework::proto::AttrType::BOOLEANS:
case framework::proto::AttrType::INTS:
case framework::proto::AttrType::FLOATS:
case framework::proto::AttrType::STRINGS:
LOG(FATAL) << "Not supported yet.";
break;
case framework::proto::AttrType::LONG: {
int64_t value = StringTo<int64_t>(value_str);
op_desc_.SetAttr(name, value);
} break;
case framework::proto::AttrType::LONGS:
default:
PADDLE_THROW("Unsupport attr type %d", type);
}
}
}
framework::VarDesc *OpTester::Var(const std::string &name) {
auto it = vars_.find(name);
if (it != vars_.end()) {
......@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
const std::vector<int64_t> &shape, T lower,
T upper) {
const std::vector<int64_t> &shape, T lower, T upper,
const std::string &initializer) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
if (platform::is_cpu_place(place_)) {
for (int i = 0; i < tensor->numel(); ++i) {
ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
framework::LoDTensor cpu_tensor;
T *cpu_ptr = nullptr;
if (!platform::is_cpu_place(place_)) {
cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
} else {
framework::LoDTensor cpu_tensor;
T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
cpu_ptr = ptr;
}
if (initializer == "random") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
} else if (initializer == "natural") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = lower + i;
}
} else if (initializer == "zeros") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = 0;
}
} else {
PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
}
if (!platform::is_cpu_place(place_)) {
TensorCopySync(cpu_tensor, place_, tensor);
}
}
......@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
}
}
for (auto &item : input_lods_) {
for (auto &item : inputs_) {
// Allocate memory for input tensor
auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << var_name;
......@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0));
const auto &data_type = var_desc->GetDataType();
if (data_type == framework::proto::VarType::INT32) {
SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::INT64) {
SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::FP32) {
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0), item.second.initializer);
} else if (data_type == framework::proto::VarType::FP64) {
SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
static_cast<double>(1.0), item.second.initializer);
} else {
PADDLE_THROW("Unsupported dtype %d.", data_type);
}
VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second;
std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]);
......@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
ss << GenSpaces(count) << "type: LOD_TENSOR\n";
ss << GenSpaces(count++) << "lod_tensor {\n";
ss << GenSpaces(count++) << "tensor {\n";
ss << GenSpaces(count) << "data_type: FP32\n";
const auto &data_type = var->GetDataType();
if (data_type == framework::proto::VarType::INT32) {
ss << GenSpaces(count) << "data_type: INT32\n";
} else if (data_type == framework::proto::VarType::INT64) {
ss << GenSpaces(count) << "data_type: INT64\n";
} else if (data_type == framework::proto::VarType::FP32) {
ss << GenSpaces(count) << "data_type: FP32\n";
} else if (data_type == framework::proto::VarType::FP64) {
ss << GenSpaces(count) << "data_type: FP64\n";
}
std::vector<int64_t> shape = var->GetShape();
for (auto d : shape) {
ss << GenSpaces(count) << "dims: " << d << "\n";
......@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
for (auto &name : op_desc_.AttrNames()) {
ss << GenSpaces(count++) << "attrs {\n";
const auto &attr_type = op_desc_.GetAttrType(name);
const auto &attr = op_desc_.GetAttr(name);
ss << GenSpaces(count) << "name: \"" << name << "\"\n";
switch (attr_type) {
case framework::proto::AttrType::BOOLEAN: {
ss << GenSpaces(count) << "type: BOOLEAN\n";
ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
} break;
case framework::proto::AttrType::INT: {
ss << GenSpaces(count) << "type: INT\n";
ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
} break;
case framework::proto::AttrType::FLOAT: {
ss << GenSpaces(count) << "type: FLOAT\n";
ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
} break;
case framework::proto::AttrType::STRING: {
ss << GenSpaces(count) << "type: STRING\n";
ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
<< "\"\n";
} break;
case framework::proto::AttrType::BOOLEANS: {
ss << GenSpaces(count) << "type: BOOLEANS\n";
ss << GenSpaces(count) << "bools: "
<< "\n";
} break;
case framework::proto::AttrType::INTS: {
ss << GenSpaces(count) << "type: INTS\n";
ss << GenSpaces(count) << "ints: "
<< "\n";
} break;
case framework::proto::AttrType::FLOATS: {
ss << GenSpaces(count) << "type: FLOATS\n";
ss << GenSpaces(count) << "floats: "
<< "\n";
} break;
case framework::proto::AttrType::STRINGS: {
ss << GenSpaces(count) << "type: STRINGS\n";
ss << GenSpaces(count) << "strings: "
<< "\n";
} break;
case framework::proto::AttrType::LONG: {
ss << GenSpaces(count) << "type: LONG\n";
ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
} break;
case framework::proto::AttrType::LONGS: {
ss << GenSpaces(count) << "type: LONGS\n";
ss << GenSpaces(count) << "longs: "
<< "\n";
} break;
default:
PADDLE_THROW("Unsupport attr type %d", attr_type);
}
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(--count) << "}\n";
return ss.str();
}
......@@ -299,6 +471,7 @@ TEST(op_tester, base) {
FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) {
VLOG(4) << "Reading config " << op_configs.size() << "...";
OpTesterConfig config;
bool result = config.Init(fin);
if (result) {
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
......@@ -39,16 +41,21 @@ class OpTester {
private:
std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames();
std::unordered_map<std::string, framework::proto::AttrType>
GetOpProtoAttrNames();
framework::proto::VarType::Type TransToVarType(std::string str);
void CreateInputVarDesc();
void CreateOutputVarDesc();
void CreateOpDesc();
framework::VarDesc *Var(const std::string &name);
void CreateVariables(framework::Scope *scope);
template <typename T>
void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper);
const std::vector<int64_t> &shape, T lower, T upper,
const std::string &initializer);
void RunImpl();
......@@ -57,7 +64,7 @@ class OpTester {
std::string type_;
framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
std::unordered_map<std::string, OpInputConfig> inputs_;
std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_;
std::unique_ptr<framework::Scope> scope_;
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
}
}
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dtype" || sep == "dtype:") {
ParseDType(is);
} else if (sep == "initializer" || sep == "initializer:") {
ParseInitializer(is);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
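An input section of the benchmark config file is a stream of "key: value" tokens enclosed by kStartSeparator and kEndSeparator; their literal values, the enclosing section keyword, and the exact dims/lod syntax handled by ParseDims and ParseLoD are all defined outside this hunk. The commented block below is therefore only a hypothetical illustration of the accepted keys:
// Hypothetical input block (separator tokens and value syntax are assumptions):
//
//   {
//     name: X
//     dtype: fp32          // int32/int, int64/long, fp32/float, fp64/double
//     initializer: random  // random, natural or zeros
//     dims: ...            // exact format handled by ParseDims (not shown here)
//     lod: ...             // exact format handled by ParseLoD
//   }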
void OpInputConfig::ParseDType(std::istream& is) {
std::string dtype_str;
is >> dtype_str;
EraseEndSep(&dtype_str);
if (dtype_str == "int32" || dtype_str == "int") {
dtype = "int32";
} else if (dtype_str == "int64" || dtype_str == "long") {
dtype = "int64";
} else if (dtype_str == "fp32" || dtype_str == "float") {
dtype = "fp32";
} else if (dtype_str == "fp64" || dtype_str == "double") {
dtype = "fp64";
} else {
PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
}
VLOG(4) << "dtype of input " << name << " is: " << dtype;
}
void OpInputConfig::ParseInitializer(std::istream& is) {
std::string initializer_str;
is >> initializer_str;
EraseEndSep(&initializer_str);
const std::vector<std::string> supported_initializers = {"random", "natural",
"zeros"};
if (!Has(supported_initializers, initializer_str)) {
PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
}
initializer = initializer_str;
VLOG(4) << "initializer of input " << name << " is: " << initializer;
}
void OpInputConfig::ParseDims(std::istream& is) {
std::string dims_str;
is >> dims_str;
......@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
number += lod_str[i];
++i;
}
level.push_back(atoi(number.c_str()));
level.push_back(StringTo<size_t>(number));
}
lod.push_back(level);
} else if (lod_str[i] == '}') {
......@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
}
}
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
......@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
is >> value;
EraseEndSep(&key, ":");
EraseEndSep(&value);
VLOG(4) << "attrs: " << key << ", " << value;
attrs[key] = value;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <istream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
......@@ -27,10 +28,14 @@ struct OpInputConfig {
OpInputConfig() {}
explicit OpInputConfig(std::istream& is);
void ParseDType(std::istream& is);
void ParseInitializer(std::istream& is);
void ParseDims(std::istream& is);
void ParseLoD(std::istream& is);
std::string name;
std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double
std::string initializer{"random"};  // random, natural, zeros
std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod;
};
......@@ -55,6 +60,23 @@ struct OpTesterConfig {
double runtime{0.0};
};
static bool Has(const std::vector<std::string>& vec, const std::string& item) {
for (size_t i = 0; i < vec.size(); ++i) {
if (vec[i] == item) {
return true;
}
}
return false;
}
template <typename T>
T StringTo(const std::string& str) {
std::istringstream is(str);
T value;
is >> value;
return value;
}
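A small usage sketch of the two helpers above, assuming Has and StringTo are in scope; added only to illustrate the intended calls:
#include <cassert>
// Illustrative usage sketch; not part of the patch.
inline void HelpersUsageSketch() {
  assert(Has({"random", "natural", "zeros"}, "zeros"));  // linear membership test
  assert(StringTo<int>("42") == 42);                     // istream-based parsing
  assert(StringTo<size_t>("7") == 7u);                   // as used by ParseLoD above
}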
} // namespace benchmark
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
......@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
Cast Operator.
This Operator casts the input tensor to another data type and
returns tha Output Tensor.
returns the Output Tensor. Casting to the same dtype as the input is
meaningless, but it is allowed.
)DOC");
}
......
......@@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("PriorBox"),
"Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("PriorBoxVar"),
"Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("TargetBox"),
"Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("BoxScore"),
"Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("DecodeBox"),
"Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutputAssignBox"),
"Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");
auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
auto target_box_dims = ctx->GetInputDim("TargetBox");
auto box_score_dims = ctx->GetInputDim("BoxScore");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBox must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
"The rank of Input of PriorBoxVar must be 1");
PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
"The shape of PriorBoxVar is [4]");
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
"The rank of Input of BoxScore must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
"The first dim of prior_box and target_box is roi nums "
"and should be same!");
PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
"The first dim of prior_box and box_score is roi nums "
"and should be same!");
PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
"The shape of target_box is [N, classnum * 4], The shape "
"of box_score is [N, classnum], The shape of prior_box "
"is [N, 4]");
ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
target_box_dims[1]}));
ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
ctx->SetOutputDim(
"OutputAssignBox",
framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
}
};
class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"PriorBox",
"(Tensor, default Tensor<float>) "
"Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
"boxes and each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the anchor box, "
"if the input is image feature map, they are close to the origin "
"of the coordinate system. [xmax, ymax] is the right bottom "
"coordinate of the anchor box.");
AddInput("PriorBoxVar",
"(Tensor, default Tensor<float>, optional) "
"PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
"group of variance. PriorBoxVar will set all elements to 1 by "
"default.")
.AsDispensable();
AddInput("TargetBox",
"(LoDTensor or Tensor) "
"This input can be a 2-D LoDTensor with shape "
"[N, classnum*4]. It holds N targets for N boxes.");
AddInput("BoxScore",
"(LoDTensor or Tensor) "
"This input can be a 2-D LoDTensor with shape "
"[N, classnum], each box is represented as [classnum] which is "
"the classification probabilities.");
AddAttr<float>("box_clip",
"(float, default 4.135, np.log(1000. / 16.)) "
"clip box to prevent overflowing")
.SetDefault(4.135f);
AddOutput("DecodeBox",
"(LoDTensor or Tensor) "
"the output tensor of op with shape [N, classnum * 4] "
"representing the result of N target boxes decoded with "
"M Prior boxes and variances for each class.");
AddOutput("OutputAssignBox",
"(LoDTensor or Tensor) "
"the output tensor of op with shape [N, 4] "
"representing the result of N target boxes decoded with "
"M Prior boxes and variances with the best non-background class "
"by BoxScore.");
AddComment(R"DOC(
Bounding Box Coder.
Decode the target bounding box with the prior_box information.
The Decoding schema is described below:
$$
ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2}
$$
$$
oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
$$
$$
ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
$$
$$
oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
$$
where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
decoded coordinates, width and height in decode_box.
decode_box is obtained after box decoding; the assigning schema is then described below:
For each prior_box, use the best non-background class's decoded values to
update the prior_box locations and get output_assign_box. So, the shape of
output_assign_box is the same as PriorBox.
)DOC");
}
};
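A single-box numeric walk-through of the decode step implemented by the CPU/CUDA kernels later in this patch; the prior box, variance and delta values are made up for illustration, and the arithmetic simply mirrors the kernels:
#include <algorithm>
#include <cmath>
#include <cstdio>
// Standalone illustration; mirrors the kernel arithmetic for one box.
int main() {
  const double prior[4] = {0.0, 0.0, 9.0, 19.0};  // [xmin, ymin, xmax, ymax]
  const double var[4] = {0.1, 0.1, 0.2, 0.2};     // pxv, pyv, pwv, phv
  const double delta[4] = {0.5, 0.0, 0.0, 0.0};   // tx, ty, tw, th
  const double box_clip = 4.135;                  // ~log(1000/16), the op default

  double pw = prior[2] - prior[0] + 1, ph = prior[3] - prior[1] + 1;  // 10, 20
  double px = prior[0] + pw / 2, py = prior[1] + ph / 2;              // 5, 10
  double dw = std::min(var[2] * delta[2], box_clip);                  // 0
  double dh = std::min(var[3] * delta[3], box_clip);                  // 0
  double cx = var[0] * delta[0] * pw + px;                            // 5.5
  double cy = var[1] * delta[1] * ph + py;                            // 10
  double w = std::exp(dw) * pw, h = std::exp(dh) * ph;                // 10, 20
  std::printf("decoded box: [%g, %g, %g, %g]\n", cx - w / 2, cy - h / 2,
              cx + w / 2 - 1, cy + h / 2 - 1);  // prints [0.5, 0, 9.5, 19]
  return 0;
}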
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
ops::BoxDecoderAndAssignOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
box_decoder_and_assign,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void DecodeBoxKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int roi_num,
const int class_num, const T box_clip,
T* output_box_data) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < roi_num * class_num) {
int i = idx / class_num;
int j = idx % class_num;
T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
T prior_box_height =
prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
int offset = i * class_num * 4 + j * 4;
T dw = prior_box_var_data[2] * target_box_data[offset + 2];
T dh = prior_box_var_data[3] * target_box_data[offset + 3];
if (dw > box_clip) {
dw = box_clip;
}
if (dh > box_clip) {
dh = box_clip;
}
T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0;
target_box_center_x =
prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width = expf(dw) * prior_box_width;
target_box_height = expf(dh) * prior_box_height;
output_box_data[offset] = target_box_center_x - target_box_width / 2;
output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
output_box_data[offset + 2] =
target_box_center_x + target_box_width / 2 - 1;
output_box_data[offset + 3] =
target_box_center_y + target_box_height / 2 - 1;
}
}
template <typename T>
__global__ void AssignBoxKernel(const T* prior_box_data,
const T* box_score_data, T* output_box_data,
const int roi_num, const int class_num,
T* output_assign_box_data) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < roi_num) {
int i = idx;
T max_score = -1;
int max_j = -1;
for (int j = 0; j < class_num; ++j) {
T score = box_score_data[i * class_num + j];
if (score > max_score && j > 0) {
max_score = score;
max_j = j;
}
}
if (max_j > 0) {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] =
output_box_data[i * class_num * 4 + max_j * 4 + pno];
}
} else {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
}
}
}
}
template <typename DeviceContext, typename T>
class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
auto* output_box = context.Output<framework::Tensor>("DecodeBox");
auto* output_assign_box =
context.Output<framework::Tensor>("OutputAssignBox");
auto roi_num = target_box->dims()[0];
auto class_num = box_score->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
auto* prior_box_var_data = prior_box_var->data<T>();
auto* box_score_data = box_score->data<T>();
output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
T* output_box_data = output_box->data<T>();
T* output_assign_box_data = output_assign_box->data<T>();
int block = 512;
int grid = (roi_num * class_num + block - 1) / block;
auto& device_ctx = context.cuda_device_context();
const T box_clip = context.Attr<T>("box_clip");
DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, roi_num, class_num,
box_clip, output_box_data);
context.device_context().Wait();
int assign_grid = (roi_num + block - 1) / block;
AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
prior_box_data, box_score_data, output_box_data, roi_num, class_num,
output_assign_box_data);
context.device_context().Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
box_decoder_and_assign,
ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
auto* output_box = context.Output<framework::Tensor>("DecodeBox");
auto* output_assign_box =
context.Output<framework::Tensor>("OutputAssignBox");
int roi_num = target_box->dims()[0];
int class_num = box_score->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
auto* prior_box_var_data = prior_box_var->data<T>();
auto* box_score_data = box_score->data<T>();
output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
T* output_box_data = output_box->data<T>();
T* output_assign_box_data = output_assign_box->data<T>();
const T bbox_clip = context.Attr<T>("box_clip");
for (int i = 0; i < roi_num; ++i) {
T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
T prior_box_height =
prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
for (int j = 0; j < class_num; ++j) {
int64_t offset = i * class_num * 4 + j * 4;
T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
bbox_clip);
T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
bbox_clip);
T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0;
target_box_center_x =
prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y = prior_box_var_data[1] *
target_box_data[offset + 1] *
prior_box_height +
prior_box_center_y;
target_box_width = std::exp(dw) * prior_box_width;
target_box_height = std::exp(dh) * prior_box_height;
output_box_data[offset] = target_box_center_x - target_box_width / 2;
output_box_data[offset + 1] =
target_box_center_y - target_box_height / 2;
output_box_data[offset + 2] =
target_box_center_x + target_box_width / 2 - 1;
output_box_data[offset + 3] =
target_box_center_y + target_box_height / 2 - 1;
}
T max_score = -1;
int max_j = -1;
for (int j = 0; j < class_num; ++j) {
T score = box_score_data[i * class_num + j];
if (score > max_score && j > 0) {
max_score = score;
max_j = j;
}
}
if (max_j > 0) {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] =
output_box_data[i * class_num * 4 + max_j * 4 + pno];
}
} else {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
}
}
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
namespace paddle {
namespace operators {
class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
"Input(FpnRois) shouldn't be null");
PADDLE_ENFORCE_GE(
ctx->Outputs("MultiFpnRois").size(), 1UL,
"Outputs(MultiFpnRois) of DistributeOp should not be empty");
size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
PADDLE_ENFORCE_GE(max_level, min_level,
"max_level must not lower than min_level");
// Set the output shape
size_t num_out_rois = max_level - min_level + 1;
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(num_out_rois);
for (size_t i = 0; i < num_out_rois; ++i) {
framework::DDim out_dim = {-1, 4};
outs_dims.push_back(out_dim);
}
ctx->SetOutputsDim("MultiFpnRois", outs_dims);
ctx->SetOutputDim("RestoreIndex", {1, -1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
.AsDuplicable();
AddOutput("RestoreIndex",
"(Tensor) An array of positive number which is "
"used to restore the order of FpnRois");
AddAttr<int>("min_level",
"The lowest level of FPN layer where the"
" proposals come from");
AddAttr<int>("max_level",
"The highest level of FPN layer where the"
" proposals come from");
AddAttr<int>("refer_level",
"The referring level of FPN layer with"
" specified scale");
AddAttr<int>("refer_scale",
"The referring scale of FPN layer with"
" specified level");
AddComment(R"DOC(
This operator distributes all proposals into different FPN levels
according to the scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of proposals,
we return an array which indicates the original index of the RoIs
in the current proposals.
)DOC");
}
};
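Both the CPU and CUDA kernels below assign a target level as tgt_lvl = floor(log2(sqrt(roi_area) / refer_scale) + refer_level), clamped to [min_level, max_level]. A standalone sketch of that formula with example attribute values (224 and 4 are illustrative choices, not defaults taken from this patch), ignoring the +1 pixel convention of BBoxArea for clarity:
#include <algorithm>
#include <cmath>
// Illustrative sketch of the level-assignment formula used by the kernels.
static int TargetLevelSketch(double roi_w, double roi_h, int min_level,
                             int max_level, int refer_level, int refer_scale) {
  double roi_scale = std::sqrt(roi_w * roi_h);  // BBoxArea would use (w+1)*(h+1)
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(tgt_lvl, min_level));
}
// TargetLevelSketch(112, 112, 2, 6, 4, 224) == 3  (half the reference scale)
// TargetLevelSketch(448, 448, 2, 6, 4, 224) == 5  (twice the reference scale)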
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
ops::DistributeFpnProposalsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
ops::DistributeFpnProposalsOpKernel<float>,
ops::DistributeFpnProposalsOpKernel<double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const BBoxSize = 4;
struct RangeInitFunctor {
int start_;
int delta_;
int* out_;
__device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
static inline void TransLoD(const int* length_lod, const int lod_size,
int* offset_lod) {
int offset = 0;
for (int i = 0; i < lod_size; ++i) {
offset_lod[i] = offset;
offset += length_lod[i];
}
}
template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static __global__ void GPUDistFpnProposalsHelper(
const int nthreads, const T* rois, const int lod_size,
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois
T roi_area = RoIArea(offset_roi, false);
T roi_scale = sqrt(roi_area);
int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = min(max_level, max(tgt_lvl, min_level));
target_lvls[i] = tgt_lvl;
// compute number of rois in the same batch and same target level
platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
1);
}
}
template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
auto* restore_index = ctx.Output<Tensor>("RestoreIndex");
const int min_level = ctx.Attr<int>("min_level");
const int max_level = ctx.Attr<int>("max_level");
const int refer_level = ctx.Attr<int>("refer_level");
const int refer_scale = ctx.Attr<int>("refer_scale");
int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int lod_size = fpn_rois_lod.size() - 1;
int roi_num = fpn_rois_lod[lod_size];
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// get batch id by lod in CPU
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({roi_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
for (int n = 0; n < lod_size; ++n) {
for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
// copy batch id list to GPU
Tensor roi_batch_id_list_gpu;
framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
&roi_batch_id_list_gpu);
Tensor sub_lod_list;
sub_lod_list.Resize({num_level, lod_size});
int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
Tensor target_lvls;
target_lvls.Resize({roi_num});
int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());
int blocks = NumBlocks(roi_num);
int threads = kNumCUDAThreads;
// get target levels and sub_lod list
GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
max_level, min_level, roi_batch_id_list_gpu.data<int>(),
sub_lod_list_data, target_lvls_data);
Tensor index_in_t;
int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
for_range(RangeInitFunctor{0, 1, idx_in});
Tensor keys_out_t;
int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
Tensor index_out_t;
int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<int, int>(
nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
idx_out, roi_num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
memory::Allocator::kScratchpad);
// Run sorting operation
// sort target level to get corresponding index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
int* restore_idx_data =
restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
// sort current index to get restore index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
restore_idx_data, roi_num);
Tensor offset_lod;
int* offset_lod_data =
offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
for (int i = 0; i < num_level; ++i) {
Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
int* sub_lod_data = sub_lod.data<int>();
// transfer length-based lod to offset-based lod
TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
int sub_rois_num = offset_lod_data[lod_size];
Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);
multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
dev_ctx.GetPlace());
GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
framework::LoD lod;
// Copy the offsets back into a properly sized host buffer; writing through
// the data() pointer of an empty std::vector is undefined behavior.
std::vector<int> offset_int(lod_size + 1);
memory::Copy(platform::CPUPlace(), offset_int.data(), place, offset_lod_data,
             sizeof(int) * (lod_size + 1), 0);
std::vector<size_t> offset(offset_int.begin(), offset_int.end());
lod.emplace_back(offset);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
distribute_fpn_proposals,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
const int kBoxDim = 4;
template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois =
context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");
auto* restore_index =
context.Output<paddle::framework::Tensor>("RestoreIndex");
const int min_level = context.Attr<int>("min_level");
const int max_level = context.Attr<int>("max_level");
const int refer_level = context.Attr<int>("refer_level");
const int refer_scale = context.Attr<int>("refer_scale");
const int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
std::vector<int> target_level;
// std::vector<int> target_level(fpn_rois_num, -1);
// record the number of rois in each level
std::vector<int> num_rois_level(num_level, 0);
std::vector<int> num_rois_level_integral(num_level + 1, 0);
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
// get the target level of current rois
T roi_scale = std::sqrt(BBoxArea(rois_data, false));
int tgt_lvl =
std::floor(std::log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
target_level.push_back(tgt_lvl);
num_rois_level[tgt_lvl - min_level]++;
rois_data += kBoxDim;
}
}
// define the output rois
// pointer which point to each level fpn rois
std::vector<T*> multi_fpn_rois_data(num_level);
// lod0 which will record the offset information of each level rois
std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
for (int i = 0; i < num_level; ++i) {
// allocate memory for each level rois
multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
context.GetPlace());
multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
std::vector<size_t> lod0(1, 0);
multi_fpn_rois_lod0.push_back(lod0);
// statistic start point for each level rois
num_rois_level_integral[i + 1] =
num_rois_level_integral[i] + num_rois_level[i];
}
restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
int* restore_index_data = restore_index->data<int>();
std::vector<int> restore_index_inter(fpn_rois_num, -1);
// distribute the rois into different fpn level by target level
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
size_t cur_offset = fpn_rois_lod[i];
// std::vector<size_t > lod_offset[num_level];
for (int j = 0; j < num_level; j++) {
multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
}
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
int lvl = target_level[cur_offset + j];
memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
kBoxDim * sizeof(T));
multi_fpn_rois_data[lvl - min_level] += kBoxDim;
int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
multi_fpn_rois_lod0[lvl - min_level][i + 1];
restore_index_inter[index_in_shuffle] = cur_offset + j;
multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
rois_data += kBoxDim;
}
}
for (int i = 0; i < fpn_rois_num; ++i) {
restore_index_data[restore_index_inter[i]] = i;
}
// merge lod information into LoDTensor
for (int i = 0; i < num_level; ++i) {
framework::LoD lod;
lod.emplace_back(multi_fpn_rois_lod0[i]);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
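The level assignment above is the standard FPN heuristic. A standalone numeric sketch (hypothetical helper; refer_level = 4 and refer_scale = 224 are the values from the FPN paper, while the operator reads both from attributes):

#include <algorithm>
#include <cmath>

// tgt_lvl = floor(log2(sqrt(w * h) / refer_scale) + refer_level),
// clipped to [min_level, max_level].
inline int TargetLevel(float w, float h, int min_level, int max_level,
                       int refer_level, int refer_scale) {
  float roi_scale = std::sqrt(w * h);
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(tgt_lvl, min_level));
}

// Example: TargetLevel(112, 112, 2, 5, 4, 224) == 3,
// since log2(112 / 224) + 4 = 3, so a 112 x 112 roi is routed to level 3.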
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
......@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
auto *output = output_t->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");
jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
out_width, jit::SeqPoolType::kSum);
......@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
const auto &ids_lod = ids_t->lod();
// at run time, the LoD level of ids must be 1
PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1");
PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL,
                  "The LoD level of Input(Ids) must be 1");
int64_t batch_size = ids_lod[0].size() - 1;
// in run time, the shape from Ids -> output
// should be [seq_length, 1] -> [batch_size, embedding_size]
// should be [seq_length, 1] -> [batch_size, last_dim]
output_t->Resize({batch_size, last_dim});
if (combiner_type == "sum") {
......@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
auto lod = ids->lod()[0];
int64_t row_width = d_output->dims()[1];
int64_t out_width = d_output->dims()[1];
framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
new_rows->resize(ids_num);
......@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
platform::CPUPlace>(out_width);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t in_offset = lod[i] * row_width;
const T *out_pos = d_output_data + i * row_width;
T *in_pos = d_table_data + in_offset;
for (int r = 0; r != h; ++r) {
blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
}
const T *src = d_output_data + i * out_width;
T *dst = d_table_data + lod[i] * out_width;
vbroadcast(src, dst, h, out_width);
}
} else {
LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
......
......@@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchVBroadcastKernel() {
for (int64_t w : {1, 16, 64, 100, 256}) {
Tensor x;
x.Resize({w});
RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
const T* x_data = x.data<T>();
for (int h : TestSizes()) {
Tensor y;
y.Resize({h * w});
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
w, x_data, y_data, static_cast<int64_t>(h), w);
}
}
}
using T = float;
using CPUPlace = paddle::platform::CPUPlace;
......@@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
......@@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}
// vbroadcast function
BENCH_FP32_CPU(kVBroadcast) {
BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
......
......@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void VBroadcastJitCode::genCode() {
preCode();
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 16;
const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs;
const size_t block_size = sizeof(float) * block;
std::vector<int> groups(num_groups, max_num_regs);
int rest_num_regs = num_block % max_num_regs;
if (rest_num_regs > 0) {
groups.push_back(rest_num_regs);
}
// copy param_h into reg_height so the loop bound is not clobbered
mov(reg_height, param_h);
Label l_next_h;
xor_(reg_h_i, reg_h_i);
mov(reg_ptr_dst_i, param_dst);
L(l_next_h);
{
mov(reg_ptr_src_i, param_src);
for (int num_regs : groups) {
size_t w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
w_offset += block_size;
}
add(reg_ptr_src_i, num_regs * block_size);
w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
w_offset += block_size;
}
add(reg_ptr_dst_i, num_regs * block_size);
} // end of groups
inc(reg_h_i);
cmp(reg_h_i, reg_height);
jl(l_next_h, T_NEAR);
} // end of l_next_h
postCode();
}
class VBroadcastCreator : public JitCodeCreator<int64_t> {
public:
bool UseMe(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const int64_t& w) const override {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0);
return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
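For comparison, the semantics the AVX code above must reproduce can be written as a plain loop (a sketch of the broadcast behaviour, not the actual refer kernel):

#include <cstdint>

// Copy the width-w source row into each of the h destination rows,
// i.e. dst[i * w + j] = src[j].
template <typename T>
void VBroadcastRefer(const T* src, T* dst, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) {
    for (int64_t j = 0; j < w; ++j) {
      dst[i * w + j] = src[j];
    }
  }
}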
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class VBroadcastJitCode : public JitCode {
public:
explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), w_(w) {
this->genCode();
}
DECLARE_JIT_CODE(VBroadcastJitCode);
void genCode() override;
private:
int w_;
reg64_t param_src{abi_param1};
reg64_t param_dst{abi_param2};
reg64_t param_h{abi_param3};
reg64_t param_w{abi_param4};
reg64_t reg_height{r9};
reg64_t reg_h_i{r10};
reg64_t reg_ptr_src_i{r11};
reg64_t reg_ptr_dst_i{r12};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -36,6 +36,8 @@ const char* to_string(KernelType kt) {
ONE_CASE(kVScal);
ONE_CASE(kVAddBias);
ONE_CASE(kVRelu);
ONE_CASE(kVBroadcast);
ONE_CASE(kVCopy);
ONE_CASE(kVIdentity);
ONE_CASE(kVExp);
ONE_CASE(kVSquare);
......
......@@ -41,6 +41,8 @@ typedef enum {
kVAdd,
kVAddBias,
kVAddRelu,
kVBroadcast,
kVCopy,
kVExp,
kVIdentity,
kVMul,
......@@ -133,6 +135,13 @@ struct GRUTuples {
typedef void (*func_type)(gru_t*, const gru_attr_t*);
};
template <typename T>
struct VBroadcastTuples {
typedef T data_type;
typedef int64_t attr_type;
typedef void (*func_type)(const T*, T*, int64_t, int64_t);
};
typedef struct seq_pool_attr_s {
int h, w; // h should always be the first one
SeqPoolType type;
......
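For reference, this is how the new tuple is consumed, mirroring the fused_embedding_seq_pool grad kernel above; a sketch, assuming the usual jit/kernels.h and platform/place.h headers:

#include <cstdint>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/place.h"

namespace jit = paddle::operators::jit;
namespace platform = paddle::platform;

// Replicate the width-w row src into h consecutive rows of dst, using
// whichever VBroadcast implementation (jitcode, mkl or refer) is picked for w.
template <typename T>
void BroadcastRows(const T* src, T* dst, int64_t h, int64_t w) {
  auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
                             platform::CPUPlace>(w);
  vbroadcast(src, dst, h, w);
}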
......@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
return d;
}
template <>
size_t JitCodeKey<int64_t>(const int64_t& d) {
return d;
}
// TODO(TJ): refine and benchmark JitCodeKey generation
constexpr int act_type_shift = 3;  // support 2^3 act types
static inline int act_type_convert(KernelType type) {
......
......@@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
......@@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
......@@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
......@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("calibration_data", "the calibration data for int8");
AddAttr<std::string>(
"engine_serialized_data",
"the serialized data contains the all info of the ICUDAEngine");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
......