提交 ecc12fb4 编写于 作者: N nhzlx

3. When running in trt mode, do not allocate memory for parameters in fluid.

test=develop
上级 9cc6249c
......@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
......@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
// When we use trt or another third-party lib, the parameters are managed by
// the lib rather than by fluid. So we need to record them to avoid duplicate
// allocation.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done
......
......@@ -14,8 +14,6 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
......@@ -42,7 +40,6 @@ void RenameAndGetOutputs(
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
......@@ -55,9 +52,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// Those parameters already exist in trt and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph.get());
CreateTensorRTOp(node, graph.get(), graph_param_names,
&repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
......@@ -72,6 +76,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
......@@ -89,8 +95,10 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
return engine_key;
}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
void TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
......@@ -124,10 +132,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// node->inputs contains both input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
for (auto *x : node->outputs) {
......@@ -161,6 +176,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
......@@ -172,22 +188,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
// Set attrs
op_desc->SetType("tensorrt_engine");
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("tensorrt_engine");
PADDLE_ENFORCE(!output_mapping.empty());
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8");
auto engine_key =
......@@ -200,6 +215,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
if (!(enable_int8 && calibration_data.size() == 0)) {
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
}
}
std::vector<std::string> ExtractParameters(
......@@ -211,7 +231,7 @@ std::vector<std::string> ExtractParameters(
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed") {
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
......
......@@ -14,6 +14,8 @@
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
......@@ -26,8 +28,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph) const;
void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};
......
......@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
......@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
......
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -16,9 +16,11 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
......@@ -26,6 +28,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
// Maps a fluid variable data type onto the TensorRT data type.
//
// FP32 maps to kFLOAT. INT32 and every other fluid type map to kINT32.
// NOTE(review): types without an explicit mapping silently fall back to
// kINT32 instead of raising an error -- confirm that this is intended before
// relying on it for other data types.
TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      break;
  }
  // Fallback for every type that has no explicit mapping above.
  return TRT_DT::kINT32;
}
// Converts a fluid tensor shape into TensorRT dims.
//
// Only shapes with exactly 2 or exactly 4 entries are accepted; anything else
// fails one of the enforce checks below. shape[0] is never copied into the
// result (presumably the batch dimension -- TODO confirm).
//   4 entries -> DimsCHW(shape[1], shape[2], shape[3])
//   2 entries -> DimsCHW(shape[1], 1, 1)
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  // The checks stay in this order so that a too-short or too-long shape is
  // reported with its dedicated message before the generic 2-or-4 check.
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);

  // Past the checks the shape has either 2 or 4 entries.
  if (shape.size() == 2UL) {
    // Pad the two missing trailing dimensions with 1.
    return nvinfer1::DimsCHW(shape[1], 1, 1);
  }
  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
}
} // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
......@@ -110,6 +143,35 @@ class OpConverter {
}
}
// Builds the TensorRT network for `block_desc` inside `engine`.
//
// Steps, in order:
//   1. Initialise the network.
//   2. Declare every name in `inputs` that is not listed in `parameters` as
//      an engine input. Its shape is read from the LoDTensor of that name in
//      `scope`, its data type from the matching var-desc in `block_desc`.
//   3. Convert the block through ConvertBlock.
//   4. Declare every name in `outputs` as an engine output.
//   5. Freeze the network.
void ConvertBlockToTRTEngine(
    framework::BlockDesc* block_desc, const framework::Scope& scope,
    const std::vector<std::string>& inputs,
    const std::unordered_set<std::string>& parameters,
    const std::vector<std::string>& outputs, TensorRTEngine* engine) {
  engine->InitNetwork();

  for (const auto& input : inputs) {
    // Names listed in `parameters` are not declared as engine inputs.
    if (parameters.find(input) != parameters.end()) {
      continue;
    }

    auto& tensor =
        inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
    const auto dims = framework::vectorize(tensor.dims());

    auto* var = block_desc->FindVar(input);
    PADDLE_ENFORCE(var, "no variable called %s", input);
    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");

    const auto trt_dtype = FluidDataType2TRT(
        var->Proto()->type().lod_tensor().tensor().data_type());
    engine->DeclareInput(input, trt_dtype, Vec2TRT_Dims(dims));
  }

  ConvertBlock(*block_desc->Proto(), parameters, scope, engine);

  for (const auto& output : outputs) {
    engine->DeclareOutput(output);
  }

  engine->FreezeNetwork();
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {}
......
......@@ -31,37 +31,6 @@ namespace paddle {
namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
// Maps a fluid variable data type onto the TensorRT data type.
//
// FP32 maps to kFLOAT. INT32 and every other fluid type map to kINT32.
// NOTE(review): types without an explicit mapping silently fall back to
// kINT32 instead of raising an error -- confirm that this is intended before
// relying on it for other data types.
TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      break;
  }
  // Fallback for every type that has no explicit mapping above.
  return TRT_DT::kINT32;
}
// Converts a fluid tensor shape into TensorRT dims.
//
// Only shapes with exactly 2 or exactly 4 entries are accepted; anything else
// fails one of the enforce checks below. shape[0] is never copied into the
// result (presumably the batch dimension -- TODO confirm).
//   4 entries -> DimsCHW(shape[1], shape[2], shape[3])
//   2 entries -> DimsCHW(shape[1], 1, 1)
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
  // The checks stay in this order so that a too-short or too-long shape is
  // reported with its dedicated message before the generic 2-or-4 check.
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);

  // Past the checks the shape has either 2 or 4 entries.
  if (shape.size() == 2UL) {
    // Pad the two missing trailing dimensions with 1.
    return nvinfer1::DimsCHW(shape[1], 1, 1);
  }
  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
}
} // namespace // NOLINT
using inference::Singleton;
using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
......@@ -161,7 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
new TensorRTEngine(max_batch_size_, workspace_size_, enable_int8_,
calib_res->calib_.get()));
VLOG(3) << "start the calib trt engine thread";
Prepare(scope, calib_res->engine_.get());
PrepareTRTEngine(scope, calib_res->engine_.get());
}));
}
......@@ -259,7 +228,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
enable_int8_, calibrator_.get()));
if (true) {
Prepare(scope, trt_engine_.get());
PrepareTRTEngine(scope, trt_engine_.get());
} else {
// create static engine
}
......@@ -267,49 +236,21 @@ class TensorRTEngineOp : public framework::OperatorBase {
return trt_engine_.get();
}
void Prepare(const framework::Scope &scope, TensorRTEngine *engine) const {
void PrepareTRTEngine(const framework::Scope &scope,
TensorRTEngine *engine) const {
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph"));
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
engine->InitNetwork();
framework::proto::BlockDesc block_proto;
block_proto.ParseFromString(Attr<std::string>("subgraph"));
framework::BlockDesc block_desc(nullptr, &block_proto);
VLOG(4) << "parsed var size " << block.AllVars().size();
std::vector<std::string> output_maps =
std::vector<std::string> inputs = Inputs("Xs");
std::vector<std::string> outputs =
Attr<std::vector<std::string>>("output_name_mapping");
// Add inputs
VLOG(4) << "declare inputs";
for (auto &input : Inputs("Xs")) {
if (param_names_.count(input)) continue;
VLOG(4) << "declare input " << input;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
auto t_shape = framework::vectorize(t.dims());
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(t_shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine);
// Add outputs
for (auto &output : output_maps) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
.ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
outputs, engine);
}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册