Commit 4f77248d authored by nhzlx, committed by ceci3

3. When running in TRT mode, do not allocate memory for parameters in fluid.

test=develop
Parent 8c171902
@@ -14,6 +14,7 @@
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,6 +25,10 @@ namespace ir {
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use TRT or another third-party lib, the parameters are managed by
+// the lib rather than by fluid, so we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";
 enum FuseOptions {
   DO_NOT_FUSE,  // fusing will not be done
......
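kRepetitiveParamAttr follows the same graph-attribute pattern as kParamScopeAttr: one pass stores a heap-allocated vector on the graph, and later passes probe for it before reading. A minimal sketch of that round-trip, using only the Graph::Set/Has/Get calls that appear in this diff (the two helper names are hypothetical):

#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"

namespace fwk = paddle::framework;

// Producer side (as in tensorrt_subgraph_pass.cc below): the graph takes
// ownership of the vector passed to Set(), so it must be heap-allocated.
void MarkRepetitiveParams(fwk::ir::Graph *graph,
                          const std::vector<std::string> &params) {
  graph->Set(fwk::ir::kRepetitiveParamAttr,
             new std::vector<std::string>(params));
}

// Consumer side (as in ir_params_sync_among_devices_pass.cc below): probe
// with Has() first, because the attribute only exists on graphs that went
// through the TensorRT subgraph pass.
std::vector<std::string> GetRepetitiveParams(fwk::ir::Graph *graph) {
  if (!graph->Has(fwk::ir::kRepetitiveParamAttr)) return {};
  return graph->Get<std::vector<std::string>>(fwk::ir::kRepetitiveParamAttr);
}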
@@ -14,8 +14,6 @@
 #include <algorithm>
 #include <set>
-#include <string>
-#include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
@@ -42,7 +40,6 @@ void RenameAndGetOutputs(
     std::unordered_map<std::string, std::string> *output_name_map);
 std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
@@ -55,9 +52,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
       Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // These parameters already exist on the TRT side and should not have
+  // another copy in fluid.
+  std::vector<std::string> repetitive_params;
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -72,6 +76,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     }
   }
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
   return graph;
 }
@@ -89,8 +95,10 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
   return engine_key;
 }
-void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
-                                            Graph *graph) const {
+void TensorRtSubgraphPass::CreateTensorRTOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -124,10 +132,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+  // node->inputs contains both input tensors and parameters.
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
   for (auto *x : node->outputs) {
@@ -161,6 +176,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }
+  PADDLE_ENFORCE(!output_mapping.empty());
   auto *vars = block_desc.Proto()->mutable_vars();
   for (framework::ir::Node *node : graph->Nodes()) {
@@ -172,22 +188,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
   op_desc->SetOutput(
       "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");
-  PADDLE_ENFORCE(!output_mapping.empty());
   op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
-  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);
   auto enable_int8 = Get<bool>("enable_int8");
   auto engine_key =
@@ -200,6 +215,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  if (!(enable_int8 && calibration_data.size() == 0)) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+  }
 }
 std::vector<std::string> ExtractParameters(
@@ -211,7 +231,7 @@ std::vector<std::string> ExtractParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));
......
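One subtlety above: params is attached to the op as the "parameters" attribute unconditionally, but it is only appended to repetitive_params when no int8 calibration run is pending, because calibration still executes the subgraph through fluid and therefore needs the weights in the fluid scope. A sketch of that guard as a standalone predicate (the helper name is hypothetical):

#include <string>

// Weights may be released from fluid unless int8 is enabled while the
// calibration table has not been generated yet; in that one case fluid
// still has to run the subgraph to collect calibration data.
bool CanSkipFluidAllocation(bool enable_int8,
                            const std::string &calibration_data) {
  bool calibration_pending = enable_int8 && calibration_data.empty();
  return !calibration_pending;
}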
@@ -14,6 +14,8 @@
 #pragma once
 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
 namespace paddle {
@@ -26,8 +28,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;
  private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };
......
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;
+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
   LOG(INFO) << "Sync params from CPU to GPU";
   PADDLE_ENFORCE(argument->gpu_device_id_valid());
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
     auto *var = scope->FindLocalVar(var_name);
     PADDLE_ENFORCE(var != nullptr);
     if (var->IsType<framework::LoDTensor>() ||
......
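Note that std::count scans repetitive_params once per variable, so the skip above costs O(V * P). With the handful of weights a TRT subgraph typically carries this is negligible; if P ever grew, a one-time hash set would make each probe O(1). A sketch of that equivalent form, under the assumption that only lookup speed changes:

#include <string>
#include <unordered_set>
#include <vector>

// Behaviorally identical skip loop with O(1) membership tests.
void SkipRepetitiveVars(const std::vector<std::string> &all_vars,
                        const std::vector<std::string> &repetitive_params) {
  const std::unordered_set<std::string> repetitive(repetitive_params.begin(),
                                                   repetitive_params.end());
  for (const auto &var_name : all_vars) {
    if (repetitive.count(var_name)) continue;  // engine already owns it
    // ... find var_name in the scope and copy its tensor to GPU, as above ...
  }
}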
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"
......
@@ -16,9 +16,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -26,6 +28,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+namespace {  // NOLINT
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unknown type");
+  return TRT_DT::kINT32;
+}
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT's tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT's tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+}  // namespace  // NOLINT
 /*
  * Convert Op from Fluid to TensorRT Engine.
  */
@@ -110,6 +143,35 @@ class OpConverter {
     }
   }
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto& t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(t_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
   void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
   virtual ~OpConverter() {}
......
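Two details of the helpers moved into this header are easy to miss. First, Vec2TRT_Dims drops shape[0]: pre-dynamic-shape TensorRT describes tensors without the batch dimension, so a fluid shape {N, C, H, W} becomes DimsCHW(C, H, W) and a 2-D {N, C} becomes DimsCHW(C, 1, 1). Second, in FluidDataType2TRT the default case returns kINT32 before the PADDLE_THROW line can ever run, so unmapped types are silently converted to INT32. A stricter variant (an alternative sketch, not what this commit does, assuming the FluidDT/TRT_DT aliases defined above) would fail loudly instead:

// Only the default branch differs from the committed helper: unmapped
// fluid types abort the conversion rather than silently becoming INT32.
TRT_DT FluidDataType2TRTStrict(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      PADDLE_THROW("unknown fluid data type in the TRT op converter");
  }
}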
@@ -31,37 +31,6 @@ namespace paddle {
 namespace operators {
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-namespace {  // NOLINT
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-}  // namespace  // NOLINT
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -161,7 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
           new TensorRTEngine(max_batch_size_, workspace_size_, enable_int8_,
                              calib_res->calib_.get()));
       VLOG(3) << "start the calib trt engine thread";
-      Prepare(scope, calib_res->engine_.get());
+      PrepareTRTEngine(scope, calib_res->engine_.get());
     }));
   }
@@ -259,7 +228,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
                                            enable_int8_, calibrator_.get()));
       if (true) {
-        Prepare(scope, trt_engine_.get());
+        PrepareTRTEngine(scope, trt_engine_.get());
       } else {
         // create static engine
       }
@@ -267,49 +236,21 @@ class TensorRTEngineOp : public framework::OperatorBase {
     return trt_engine_.get();
   }
-  void Prepare(const framework::Scope &scope, TensorRTEngine *engine) const {
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
     LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                  "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-    engine->InitNetwork();
-    VLOG(4) << "parsed var size " << block.AllVars().size();
-    std::vector<std::string> output_maps =
-        Attr<std::vector<std::string>>("output_name_mapping");
-    // Add inputs
-    VLOG(4) << "declare inputs";
-    for (auto &input : Inputs("Xs")) {
-      if (param_names_.count(input)) continue;
-      VLOG(4) << "declare input " << input;
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-      auto *var = block.FindVar(input);
-      // TensorRT engine need to create parameters. The parameter's description
-      // should be set in
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
-    }
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
+        Attr<std::vector<std::string>>("output_name_mapping");
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
-    // Add outputs
-    for (auto &output : output_maps) {
-      engine->DeclareOutput(output);
-    }
-    engine->FreezeNetwork();
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
   }
 };
......
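Taken together, the pieces of this commit form one pipeline: the subgraph pass records the weights feeding each TRT subgraph in the op's "parameters" attribute and marks them repetitive on the graph; the params-sync pass then skips them when copying the scope to GPU; and PrepareTRTEngine hands param_names_ to ConvertBlockToTRTEngine, which skips them when declaring engine inputs and lets ConvertBlock read their values from the scope instead. The diff does not show how param_names_ is filled; a hedged sketch of the assumed wiring from that attribute:

#include <string>
#include <unordered_set>
#include <vector>

// Assumed, not shown in this diff: the engine op copies its "parameters"
// attribute (now set by the subgraph pass to only the weights feeding the
// subgraph) into the set used for the param_names_.count(input) checks.
std::unordered_set<std::string> CollectParamNames(
    const std::vector<std::string> &parameters_attr) {
  return {parameters_attr.begin(), parameters_attr.end()};
}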