Unverified commit feb2e476, authored by denglin-github, committed by GitHub

Nne integration (#32255)

* Add dlnne engine runtime

* Fix log

* Remove <const_cast>, drop changes unrelated to dlnne, and apply clang-format

* Fix CMakeLists.txt format error

* Add copyright message

* Fix dlnne CMakeLists.txt

* Add some PaddlePaddle passes to support more networks

* Fix some formatting bugs
Parent ef8671e5
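The diff below threads DLNNE through the inference stack: new Argument fields, a dlnne_subgraph_pass, an AnalysisConfig::EnableDlnne switch, a dlnne_engine operator, plus C-API and pybind bindings. As orientation, here is a minimal C++ usage sketch of the new switch on top of the existing AnalysisConfig workflow; the helper function name and model paths are illustrative and not part of this commit.

// Hypothetical usage sketch, not part of this commit.
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureDlnne(paddle::AnalysisConfig *config) {
  // Illustrative model files; replace with your own.
  config->SetModel("./model/__model__", "./model/params");
  // Added by this commit: fuse subgraphs of at least 3 supported ops and run
  // them through the DLNNE engine.
  config->EnableDlnne(/*min_subgraph_size=*/3);
  if (config->dlnne_enabled()) {
    // AnalysisConfig::Update() now replaces the IR pass list with
    // kDlnneSubgraphPasses, which ends with dlnne_subgraph_pass.
  }
}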
@@ -213,6 +213,11 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
...
@@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument,
// run fp16.
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
} else if (pass_name == "dlnne_subgraph_pass") {
pass->Set("min_subgraph_size",
new int(argument->dlnne_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
}
if (pass_name == "lite_subgraph_pass") {
bool enable_int8 =
...
@@ -20,3 +20,15 @@ if (WITH_LITE)
set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "")
cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog)
endif()
MESSAGE("WITH_DLNNE:${WITH_DLNNE}")
if(WITH_DLNNE)
cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util)
set(analysis_deps ${analysis_deps}
subgraph_util dlnne_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "")
endif()
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace inference {
int RegisterPyFunc(const std::string& name, void* pfn);
} // namespace inference
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h"
#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
int (*PyConvertGraph)(const char *graph_name);
int RegisterPyFunc(const std::string &name, void *pfn) {
if (name.compare("convert_graph") == 0) {
PyConvertGraph = reinterpret_cast<decltype(PyConvertGraph)>(pfn);
}
return 0;
}
int ConvertGraph(std::string graph_name) {
LOG(INFO) << "starting doing convert_graph";
PyConvertGraph(graph_name.c_str());
return 0;
}
namespace analysis {
using framework::ir::Node;
void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const {
static std::unordered_set<std::string> teller_set{
"mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "prelu",
"conv2d_transpose", "leaky_relu",
// "fc",
"shuffle_channel", "swish", "split",
// "instance_norm",
"gelu",
// "layer_norm",
// "scale",
// "stack",
"relu6", "reshape2", "transpose2", "concat", "slice",
};
framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph);
auto teller = [&](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return teller_set.find(node->Op()->Type()) != teller_set.end();
};
framework::ir::SubGraphFuser fuser(
graph, teller, Get<int>("min_subgraph_size") /*min subgraph size*/,
"dlnne_engine");
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// These parameters already exist in dlnne and should not have another copy
// in fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateDlnneOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
}
}
std::unordered_set<const Node *> nodes2remove;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && framework::ir::Agent(node).deleted()) {
nodes2remove.insert(node);
}
}
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
}
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs,
const std::string &predictor_id) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += predictor_id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
std::string replace_name(std::string name, const char *raw,
const char *new_char) {
std::string r_name = name;
int pos = r_name.find(raw);
while (pos >= 0) {
r_name = r_name.replace(pos, 1, new_char);
pos = r_name.find(raw);
}
return r_name;
}
void DlnneSubgraphPass::CreateDlnneOp(
framework::ir::Node *node, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *framework::ir::Agent(node).subgraph();
PADDLE_ENFORCE_EQ(subgraph.empty(), false,
platform::errors::PreconditionNotMet(
"The subgraph should not be empty."));
// A fake block desc.
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes";
// for debug
framework::ProgramDesc tmp_dump_program_desc;
auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0);
std::unordered_map<std::string, framework::VarDesc *> name_var_desc;
std::set<std::string> name_var_input_nodes;
std::set<std::string> name_var_output_nodes;
std::set<std::string> name_ops;
for (auto *node : subgraph) {
auto *op = block_desc.AppendOp();
*op->Proto() = *node->Op()->Proto();
// debug
{
name_ops.insert(node->Name());
auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp();
framework::OpDesc op_desc;
op_desc.CopyFrom(*node->Op());
for (auto argument_name : op_desc.InputArgumentNames()) {
if (std::count(graph_params.begin(), graph_params.end(),
argument_name) > 0) {
op_desc.Rename(argument_name, replace_name(argument_name, "/", "."));
}
}
for (auto argument_name : op_desc.OutputArgumentNames()) {
if (std::count(graph_params.begin(), graph_params.end(),
argument_name) > 0) {
op_desc.Rename(argument_name, replace_name(argument_name, "/", "."));
}
}
*tmp_dump_new_block_op->Proto() = *op_desc.Proto();
for (auto *x : node->inputs) {
if (x->IsVar()) {
name_var_desc[x->Name()] = x->Var();
}
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) ==
0)
name_var_input_nodes.insert(x->Name());
}
for (auto *x : node->outputs) {
if (x->IsVar()) {
name_var_desc[x->Name()] = x->Var();
}
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) ==
0)
name_var_output_nodes.insert(x->Name());
}
}
}
std::set<std::string> valid_input_names;
std::set<std::string> valid_output_names;
for (auto name : name_var_output_nodes) {
if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) {
valid_output_names.insert(name);
}
}
for (auto name : name_var_input_nodes) {
if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) {
valid_input_names.insert(name);
}
}
// Then, we will use the input_names_with_id and output_names_with_id to
// generate the engine key.
// So, we use std::set instead of std::unordered_set here to ensure that the
// engine key is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// If we delete the fluid copy of params shared by more than one op, there
// will be a problem, so we filter them out.
// The node->inputs contains input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
std::vector<int> origin_output_dims;
for (auto *x : node->outputs) {
origin_output_dims.push_back(x->Var()->GetShape().size());
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
std::unordered_map<std::string, std::string> output_name_map;
std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
graph_var_map[node->Name()] = node;
}
}
// Set attrs
op_desc->SetType("dlnne_engine");
op_desc->SetInput("Xs", std::vector<std::string>(valid_input_names.begin(),
valid_input_names.end()));
op_desc->SetOutput("Ys", std::vector<std::string>(valid_output_names.begin(),
valid_output_names.end()));
op_desc->SetAttr("parameters", params);
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0));
op_desc->SetAttr("engine_key", engine_key);
auto *scope = param_scope();
{
std::set<std::string> input_names;
for (auto name : name_var_input_nodes) {
if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) {
input_names.insert(name);
}
}
// add feed to subgraph:
int input_idx = 0;
for (auto input_name : input_names) {
auto *feed0 = tmp_dump_main_block->AppendOp();
feed0->SetType("feed");
feed0->SetInput("X", {"feed"});
feed0->SetOutput("Out", {input_name});
feed0->SetAttr("col", input_idx);
input_idx++;
}
// add fetch to subgraph:
int output_idx = 0;
for (auto output_name : valid_output_names) {
auto *fetch0 = tmp_dump_main_block->AppendOp();
fetch0->SetType("fetch");
fetch0->SetInput("X", {output_name});
fetch0->SetOutput("Out", {"out"});
fetch0->SetAttr("col", output_idx);
output_idx++;
}
mkdir("./dump", 0777);
std::string dir_name = "./dump/" + engine_key;
mkdir(dir_name.c_str(), 0777);
std::ofstream m_stream;
m_stream.open(dir_name + "/__model__", std::ios::out);
VLOG(4) << "name_var_desc size:" << name_var_desc.size();
for (auto &kv : name_var_desc) {
auto *new_add_var = tmp_dump_main_block->Proto()->add_vars();
*new_add_var = *kv.second->Proto();
auto *variable_tmp = scope->FindVar(kv.first);
if (variable_tmp != nullptr) {
*new_add_var->mutable_name() = replace_name(kv.first, "/", ".");
new_add_var->set_persistable(true);
} else {
new_add_var->set_persistable(false);
}
}
for (auto param_name : params) {
auto *var = scope->FindVar(param_name);
if (var != nullptr) {
auto *var_t = var->GetMutable<framework::LoDTensor>();
std::ofstream p_stream;
p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."),
std::ios::out);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(var_t->place());
framework::SerializeToStream(p_stream, *var_t, dev_ctx);
p_stream.close();
}
}
std::string model;
tmp_dump_program_desc.Proto()->SerializeToString(&model);
m_stream << model;
m_stream.close();
op_desc->SetBlockAttr("sub_block", tmp_dump_main_block);
op_desc->SetAttr("subgraph", model);
op_desc->Flush();
ConvertGraph(engine_key);
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
REGISTER_PASS(dlnne_subgraph_pass,
paddle::inference::analysis::DlnneSubgraphPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
int ConvertGraph(std::string graph_name);
namespace analysis {
class DlnneSubgraphPass : public framework::ir::FusePassBase {
public:
void ApplyImpl(framework::ir::Graph *graph) const override;
private:
void CleanIntermediateOutputs(framework::ir::Node *node);
void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
@@ -26,6 +26,7 @@ namespace paddle {
struct MkldnnQuantizerConfig;
extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kDlnneSubgraphPasses;
extern const std::vector<std::string> kLiteSubgraphPasses;
PassStrategy *AnalysisConfig::pass_builder() const {
@@ -134,6 +135,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_oss_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
@@ -211,6 +215,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
pass_builder_->DeletePass(ps);
}
}
if (use_dlnne_) {
auto all_passes = kDlnneSubgraphPasses;
auto other_passes = other.pass_builder()->AllPasses();
// We should sort them, because the user may call the SwitchIrDebug
// interface, which will change the pass.
std::sort(all_passes.begin(), all_passes.end());
std::sort(other_passes.begin(), other_passes.end());
std::vector<std::string> deleted_passes;
std::set_difference(all_passes.begin(), all_passes.end(),
other_passes.begin(), other_passes.end(),
std::inserter(deleted_passes, deleted_passes.begin()));
for (auto ps : deleted_passes) {
pass_builder_->DeletePass(ps);
}
}
}
void AnalysisConfig::EnableCUDNN() {
@@ -309,6 +328,12 @@ void AnalysisConfig::EnableTensorRtEngine(
#endif
}
void AnalysisConfig::EnableDlnne(int min_subgraph_size) {
use_dlnne_ = true;
dlnne_min_subgraph_size_ = min_subgraph_size;
Update();
}
void AnalysisConfig::SetTRTDynamicShapeInfo(
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
@@ -383,6 +408,14 @@ void AnalysisConfig::Update() {
pass_builder()->AppendPass(pass);
}
}
LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl;
if (use_dlnne_) {
pass_builder()->ClearPasses();
for (const auto &pass : kDlnneSubgraphPasses) {
pass_builder()->AppendPass(pass);
}
}
if (use_gpu() && use_cudnn_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
@@ -479,6 +512,9 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << tensorrt_max_batchsize_;
ss << tensorrt_min_subgraph_size_;
ss << use_dlnne_;
ss << dlnne_min_subgraph_size_;
for (auto &op : trt_disabled_ops_) ss << op.c_str();
ss << ";";
...
@@ -537,6 +537,12 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
}
if (config_.dlnne_enabled()) {
LOG(INFO) << "Dlnne subgraph is enabled";
argument_.SetUseDlnne(true);
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
...
@@ -360,6 +360,9 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_dla_enabled() { return trt_use_dla_; }
void EnableDlnne(int min_subgraph_size = 3);
bool dlnne_enabled() const { return use_dlnne_; }
///
/// \brief Turn on the usage of Lite sub-graph engine.
///
@@ -627,6 +630,10 @@ struct PD_INFER_DECL AnalysisConfig {
std::vector<std::string> trt_disabled_ops_{};
bool disable_trt_plugin_fp16_{false};
// dlnne related.
bool use_dlnne_{false};
int dlnne_min_subgraph_size_{3};
// memory reuse related.
bool enable_memory_optim_{false};
...
@@ -110,6 +110,15 @@ const std::vector<std::string> kTRTSubgraphPasses({
"transpose_flatten_concat_fuse_pass",
});
const std::vector<std::string> kDlnneSubgraphPasses({
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_bn_fuse_pass", //
"depthwise_conv_bn_fuse_pass", //
"shuffle_channel_detect_pass", //
"dlnne_subgraph_pass", //
});
const std::vector<std::string> kLiteSubgraphPasses({
#ifdef PADDLE_WITH_LITE
"lite_subgraph_pass",
...
@@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
/// \brief List of tensorRT subgraph passes.
PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
/// \brief List of dlnne subgraph passes.
PD_INFER_DECL extern const std::vector<std::string> kDlnneSubgraphPasses;
/// \brief List of lite subgraph passes.
PD_INFER_DECL extern const std::vector<std::string> kLiteSubgraphPasses;
...
@@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
return config->config.tensorrt_engine_enabled();
}
void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) {
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableDlnne(min_subgraph_size);
}
bool PD_DlnneEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.dlnne_enabled();
}
void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
PADDLE_ENFORCE_NOT_NULL(
config,
...
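A matching C-API sketch, again hedged: only PD_EnableDlnne and PD_DlnneEnabled come from this commit; the header path and the PD_NewAnalysisConfig/PD_DeleteAnalysisConfig helpers are assumed from the existing capi surface.

// Hypothetical C-API usage sketch, not part of this commit.
#include "paddle/fluid/inference/capi/paddle_c_api.h"  // assumed header path

void ConfigureDlnneC() {
  PD_AnalysisConfig *config = PD_NewAnalysisConfig();  // assumed pre-existing helper
  PD_EnableDlnne(config, /*min_subgraph_size=*/3);     // added in this commit
  bool enabled = PD_DlnneEnabled(config);              // true once DLNNE is on
  (void)enabled;
  PD_DeleteAnalysisConfig(config);                     // assumed pre-existing helper
}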
@@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
if (WITH_DLNNE)
add_subdirectory(dlnne)
endif()
if (WITH_LITE)
add_subdirectory(lite)
endif()
...
# compile flags
set(DLNNE_FLAGS
-Wno-error=non-virtual-dtor
-Wno-error=unused-variable
-Wno-error=attributes
${fsanitize}
)
foreach(flag ${DLNNE_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
# add nne
find_path(DLNNE_INCLUDE_DIR dlnne.h
PATHS
$ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include
NO_DEFAULT_PATH
)
find_library(DLNNE_LIB libdlnne.so
PATHS
$ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne
NO_DEFAULT_PATH
)
find_path(CUDA_INCLUDE_DIR cuda.h
$ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include
)
find_library(CURT_LIB libcurt.so
PATHS
$ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib
NO_DEFAULT_PATH
)
message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR})
message("DLNNE_LIB: "${DLNNE_LIB})
message("CUDA_INCLUDE_DIR: "${CUDA_INCLUDE_DIR})
message("CURT_LIB: "${CURT_LIB})
include_directories("${DLNNE_INCLUDE_DIR}")
include_directories("${CUDA_INCLUDE_DIR}")
op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope)
#message("PYBIND_FILE:${pybind_file}")
#file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n")
#endif()
target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB})
cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h"
namespace paddle {
namespace inference {
void CopyTensorDeviceToCpu(void* dst_ptr, void* src_ptr, int total_bytes) {
cudaDeviceSynchronize();
cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) {
cudaDeviceSynchronize();
cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
}
} // namespace inference
namespace operators {
class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different DLNNE Engines");
AddAttr<framework::BlockDesc*>("sub_block", "the trt block");
AddComment("Dlnne engine operator.");
}
};
class DlnneEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(dlnne_engine, ops::DlnneEngineOp, ops::DlnneEngineOpMaker);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h> // NOLINT
#include <cuda_runtime.h> // NOLINT
#include <dlnne.h> // NOLINT
#include <assert.h>
#include <ctime>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace dl {
namespace nne {
class Builder;
class Engine;
class Network;
class Parser;
class ExecutionContext;
} // namespace nne
} // namespace dl
namespace paddle {
namespace inference {
class NneDeleter {
public:
NneDeleter() {}
template <typename T>
inline void operator()(T *ptr) {
if (ptr != nullptr) {
ptr->Destroy();
}
}
};
void CopyTensorDeviceToCpu(void *dst_ptr, void *src_ptr, int total_bytes);
void CopyTensorCpuToDevice(void *dst_ptr, void *src_ptr, int total_bytes);
template <typename T>
struct Singleton;
} // namespace inference
} // namespace paddle
namespace paddle {
namespace operators {
class DlnneEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
std::string engine_key_;
int num_inputs;
int num_outputs;
std::vector<std::string> output_names;
std::vector<std::string> input_names;
dl::nne::Builder *builder;
dl::nne::Parser *parser;
dl::nne::Network *network;
dl::nne::ExecutionContext *context;
dl::nne::Engine *engine;
unsigned int engine_input_size;
std::vector<int> InputIndexToBindIndex_;
public:
DlnneEngineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {
input_names_ = Inputs("Xs");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
num_inputs = 0;
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
num_inputs += 1;
input_names.push_back(x);
}
num_outputs = Outputs("Ys").size();
for (const auto &y : Outputs("Ys")) {
VLOG(4) << "y: " << y << std::endl;
output_names.push_back(y);
}
// onnx path
std::stringstream filename;
std::string current_path = ".";
char *buffer;
if ((buffer = getcwd(NULL, 0)) != NULL) {
current_path = buffer;
} else {
current_path = ".";
}
filename << current_path << "/dump/" << engine_key_ << "/" << engine_key_
<< ".onnx";
builder = dl::nne::CreateInferBuilder();
PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable(
"nne create builder failed"));
parser = dl::nne::CreateParser();
PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable(
"nne create parser failed"));
network = builder->CreateNetwork();
LOG(INFO) << "set output for dlnne";
for (std::string &output_op_name : output_names)
parser->RegisterOutput(output_op_name.c_str());
LOG(INFO) << "parser onnx for dlnne";
parser->Parse(filename.str().c_str(), *network);
LOG(INFO) << "build network";
engine = builder->BuildEngine(*network);
// total size = input_size+output_size
engine_input_size = num_inputs + num_outputs;
for (std::string &input_name : input_names) {
int BindIndex = engine->GetBindingIndex(input_name.c_str());
InputIndexToBindIndex_.push_back(BindIndex);
}
for (std::string &output_name : output_names) {
int BindIndex = engine->GetBindingIndex(output_name.c_str());
InputIndexToBindIndex_.push_back(BindIndex);
}
// context
context = engine->CreateExecutionContext();
}
~DlnneEngineOp() {
network->Destroy();
context->Destroy();
engine->Destroy();
parser->Destroy();
builder->Destroy();
}
protected:
void RunDlnneOnCreateEngine(const framework::Scope &scope,
const platform::Place &dev_place) const {
PADDLE_ENFORCE_EQ(
input_names_.empty(), false,
platform::errors::PreconditionNotMet(
"Dlnne engine needs at least one input, but no input is found. "
"Please check if you set the input correctly."));
std::vector<void *> input_buffers(num_inputs);
std::vector<void *> cpu_input_buffers(num_inputs);
std::vector<std::vector<int64_t>> input_shapes(num_inputs);
std::vector<int32_t> input_data_types(num_inputs);
std::vector<int64_t> input_bytes(num_inputs);
int index = 0;
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
// convert input and copy to Dlnne engine's buffer
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
const int bind_index = index;
index++;
int64_t data_bytes;
int32_t dtype;
auto type = t.type();
data_bytes = 1;
void *buffer = nullptr;
if (type == framework::proto::VarType::FP32) {
buffer = static_cast<void *>(t.data<float>());
data_bytes = 4;
dtype = 0;
} else if (type == framework::proto::VarType::INT64) {
buffer = static_cast<void *>(t.data<int64_t>());
data_bytes = 8;
dtype = 1;
} else if (type == framework::proto::VarType::INT32) {
buffer = static_cast<void *>(t.data<int32_t>());
data_bytes = 4;
dtype = 2;
} else {
PADDLE_THROW(platform::errors::Fatal(
"The DLNNE Engine OP only support float/int32_t/int64_t input."));
}
input_buffers[bind_index] = buffer;
auto t_shape = framework::vectorize<int64_t>(t.dims());
std::vector<int64_t> runtime_input_shape(t_shape.begin(), t_shape.end());
for (auto &size : t_shape) {
data_bytes = data_bytes * size;
}
VLOG(4) << "buffers_size:" << data_bytes;
cpu_input_buffers[bind_index] =
input_buffers[bind_index]; // malloc(data_bytes);
input_shapes[bind_index] = runtime_input_shape;
input_data_types[bind_index] = dtype;
input_bytes[bind_index] = data_bytes;
}
// output shape
std::vector<std::vector<int64_t>> out_shapes;
std::vector<int32_t> output_bytes;
for (int i = 0; i < num_outputs; i++) {
int index = engine->GetBindingIndex(output_names[i].c_str());
dl::nne::Dims out_dim = engine->GetBindingDimensions(index);
std::vector<int64_t> shape(out_dim.nbDims);
for (int dim = 0; dim < out_dim.nbDims; dim++) {
shape[dim] = (out_dim.d[dim]);
}
out_shapes.push_back(shape);
int64_t data_bytes;
// float32
data_bytes = 4;
for (auto &size : shape) {
data_bytes = data_bytes * size;
}
VLOG(4) << "data_bytes: " << data_bytes;
output_bytes.push_back(data_bytes);
}
int bind_index = 0;
std::vector<void *> cpu_output_buffers(num_outputs);
std::vector<void *> output_buffers(num_outputs);
std::vector<int32_t> output_dtypes(num_outputs);
for (const auto &y : Outputs("Ys")) {
auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(
fluid_v,
platform::errors::NotFound(
"Output variable %s is not found in DLNNE subgraph.", y));
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
VLOG(4) << "out_shapes[bind_index] dim:" << out_shapes[bind_index].size();
fluid_t->Resize(framework::make_ddim(out_shapes[bind_index]));
int32_t dtype;
output_buffers[bind_index] = fluid_t->mutable_data<float>(
BOOST_GET_CONST(platform::CPUPlace, dev_place));
dtype = 0;
cpu_output_buffers[bind_index] =
output_buffers[bind_index]; // malloc(data_bytes);
output_dtypes[bind_index] = dtype;
bind_index++;
}
std::vector<void *> engine_input_ptr(engine_input_size);
// set input_ptr
for (unsigned int i = 0; i < engine_input_size; i++) {
if (InputIndexToBindIndex_[i] < 0) continue;
if (engine->BindingIsInput(InputIndexToBindIndex_[i])) {
// copy cpu buffer to gpu buffer
int64_t total_bytes;
total_bytes = input_bytes[i];
VLOG(4) << "input_bytes: " << total_bytes;
void *gpu_ptr;
cudaMalloc(&gpu_ptr, total_bytes);
engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr;
paddle::inference::CopyTensorCpuToDevice(
gpu_ptr, reinterpret_cast<void *>(cpu_input_buffers[i]),
total_bytes);
} else {
int64_t total_size;
total_size = output_bytes[i - input_names.size()];
VLOG(4) << "output_bytes: " << total_size;
void *gpu_ptr;
cudaMalloc(&gpu_ptr, total_size);
engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr;
}
}
clock_t startTime, endTime;
startTime = clock();
context->Execute(1, engine_input_ptr.data());
endTime = clock();
double during_ms =
static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC * 1000;
LOG(INFO) << "dlNNE execute time: " << during_ms << " ms";
bind_index = 0;
for (unsigned int i = 0; i < engine_input_size; i++) {
if (InputIndexToBindIndex_[i] < 0) continue;
if (i >= input_names.size()) {
void *cpu_ptr = cpu_output_buffers[i - input_names.size()];
int64_t size;
size = output_bytes[i - input_names.size()];
paddle::inference::CopyTensorDeviceToCpu(
cpu_ptr, engine_input_ptr[InputIndexToBindIndex_[i]], size);
// dtype: float32
int32_t dtypes;
dtypes = 0;
cpu_output_buffers[bind_index] = cpu_ptr;
output_dtypes[bind_index] = dtypes;
bind_index++;
}
cudaFree(engine_input_ptr[InputIndexToBindIndex_[i]]);
}
}
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
RunDlnneOnCreateEngine(scope, dev_place);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
USE_NO_KERNEL_OP(dlnne_engine);
namespace paddle {
namespace operators {
namespace {
void CreateCUDATensor(framework::Scope* scope, const std::string& name,
const std::vector<int64_t>& shape) {
auto* var = scope->Var(name);
auto* tensor = var->GetMutable<framework::LoDTensor>();
auto dims = framework::make_ddim(shape);
tensor->Resize(dims);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
const std::string& name,
const std::vector<int64_t>& shape) {
using framework::proto::VarType;
auto* var = block->add_vars();
framework::VarDesc desc(name);
desc.SetType(VarType::LOD_TENSOR);
desc.SetDataType(VarType::FP32);
desc.SetShape(shape);
*var = *desc.Proto();
}
} // namespace
using inference::analysis::SetAttr;
TEST(DlnneEngineOp, manual) {
framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
block_->set_parent_idx(-1);
LOG(INFO) << "create block desc";
framework::BlockDesc block_desc(&program, block_);
LOG(INFO) << "create fc op";
auto* fc0 = block_desc.AppendOp();
fc0->SetType("fc");
fc0->SetInput("X", std::vector<std::string>({"x"})); // 4 x 1 x 1
fc0->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6
fc0->SetOutput("Out", std::vector<std::string>({"z"})); // 6 x 1 x 1
LOG(INFO) << "create fc op";
auto* fc1 = block_desc.AppendOp();
fc1->SetType("fc");
fc1->SetInput("X", std::vector<std::string>({"z"}));
fc1->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8
fc1->SetOutput("Out", std::vector<std::string>({"z0"})); // 8 x 1 x 1
// Set inputs' variable shape in BlockDesc
// the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
// It is weird, but we need to copy manually.
*block_->add_ops() = *fc0->Proto();
*block_->add_ops() = *fc1->Proto();
ASSERT_EQ(block_->ops_size(), 2);
LOG(INFO) << "create dlnne desc";
framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("dlnne_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
engine_op_desc.SetBlockAttr("sub_block", &block_desc);
engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
engine_op_desc.SetAttr("calibration_engine_key",
std::string("a_calib_engine"));
engine_op_desc.SetAttr("predictor_id", 1);
engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("origin_output_dims", std::vector<int>({2}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
LOG(INFO) << "engine_op " << engine_op.get();
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
// Prepare variables.
CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
CreateCUDATensor(&scope, "y0", std::vector<int64_t>({6, 8}));
CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 8}));
// Execute them.
LOG(INFO) << "engine_op run";
engine_op->Run(scope, place);
}
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
block_->set_parent_idx(-1);
using shape_t = std::vector<int64_t>;
LOG(INFO) << "create block desc";
framework::BlockDesc block_desc(&program, block_);
auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
const std::string& z_name, bool x_created,
const shape_t& x_shape, const shape_t& y_shape,
const shape_t& z_shape) {
LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp();
fc->SetType("mul");
fc->SetInput("X", std::vector<std::string>({x_name}));
fc->SetInput("Y", std::vector<std::string>({y_name}));
fc->SetOutput("Out", std::vector<std::string>({z_name}));
// Set inputs' variable shape in BlockDesc
if (!x_created) {
AddTensorToBlockDesc(block_, x_name,
std::vector<int64_t>({batch_size, input_dim, 1, 1}));
}
AddTensorToBlockDesc(block_, y_name,
std::vector<int64_t>({input_dim, output_dim}));
AddTensorToBlockDesc(block_, z_name,
std::vector<int64_t>({batch_size, output_dim}));
// Prepare variables.
if (!x_created) {
CreateCUDATensor(&scope, x_name, std::vector<int64_t>(x_shape));
}
CreateCUDATensor(&scope, y_name, std::vector<int64_t>(y_shape));
CreateCUDATensor(&scope, z_name, std::vector<int64_t>(z_shape));
// It is weird, but we need to copy manually.
*block_->add_ops() = *fc->Proto();
};
// Test with 4 layer FC
AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
{input_dim, output_dim}, {batch_size, output_dim});
AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
{batch_size, output_dim});
AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
{batch_size, output_dim});
AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
{batch_size, output_dim});
LOG(INFO) << "create dlnne desc";
framework::OpDesc engine_op_desc(nullptr);
engine_op_desc.SetType("dlnne_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
engine_op_desc.SetBlockAttr("sub_block", &block_desc);
engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
engine_op_desc.SetAttr("parameters",
std::vector<std::string>({"y0", "y1", "y2", "y3"}));
engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
engine_op_desc.SetAttr("calibration_engine_key",
std::string("b_calib_engine"));
engine_op_desc.SetAttr("predictor_id", 1);
engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z3"}));
engine_op_desc.SetAttr("origin_output_dims", std::vector<int>({2}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
// Execute them.
engine_op->Run(scope, place);
}
// Test with a larger FC layer.
TEST(DlnneEngineOp, fc) { Execute(40, 28, 28); }
} // namespace operators
} // namespace paddle
USE_TRT_CONVERTER(fc)
@@ -512,6 +512,8 @@ void BindAnalysisConfig(py::module *m) {
py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne", &AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3)
.def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("zero_copy") = false, py::arg("zero_copy") = false,
......