Unverified commit 74812d1c authored by Pei Yang, committed by GitHub

Fix bugs: Paddle-TRT repeatedly sets weight_map and over-deletes repetitive_params (#19825)

* fix trt bugs when sharing params, test=develop

* add unittest for cascade_rcnn
Parent e2372750
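This commit fixes two related problems that show up when several ops share the same parameter (as in the cascade R-CNN model added to the tests below): every converter registered the shared weight into the engine's weight_map under the same name, tripping the "set weight with the same name twice" check, and the subgraph pass scheduled the shared parameter's fluid copy for deletion even though other ops still needed it. The first problem is solved by appending an auto-incrementing suffix to each registered weight name. A minimal, self-contained sketch of that idea (plain std::unordered_map with int payloads, not the actual TensorRTEngine class):

#include <memory>
#include <string>
#include <unordered_map>

// Stand-in for TensorRTEngine::weight_map.
std::unordered_map<std::string, std::unique_ptr<int>> weight_map;

// Mirrors the SetWeights() helper added to engine.h below: every call gets a
// unique key, so registering the same parameter name from two ops no longer
// overwrites or rejects the earlier entry.
void SetWeights(const std::string& w_name, std::unique_ptr<int> w_tensor) {
  static int suffix_counter = 0;
  weight_map[w_name + std::to_string(suffix_counter)] = std::move(w_tensor);
  suffix_counter += 1;
}

With this scheme, two converters that both register "conv_w" produce keys such as "conv_w0" and "conv_w1" instead of one entry clobbering the other.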
......@@ -102,7 +102,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// const framework::BlockDesc& main_block = program_desc->Block(0);
framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
// An fake block desc.
// A fake block desc.
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
......@@ -118,20 +118,27 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
}
// Then, we will use the input_names_with_id and output_names_with_id to
// generate the eigine key.
// generate the engine key.
// So, we use set instead of unordered_set here to ensure that the engine key
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// If we delete the fluid copy of params shared by more than one op, there
// will be a problem, so we filter them out.
std::vector<std::string> params_not_shared;
// The node->inputs containes input tensors and parameters.
// The node->inputs contains input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
x->outputs.size() <= 1) {
params_not_shared.push_back(x->Name());
}
}
std::set<std::string> output_names;
......@@ -241,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
return;
}
std::copy(params.begin(), params.end(),
std::copy(params_not_shared.begin(), params_not_shared.end(),
std::back_inserter(*repetitive_params));
tensorrt::TensorRTEngine *trt_engine =
......
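The second problem is addressed in the subgraph-pass hunks above: every parameter is still collected into params (and still copied into the TensorRT engine), but only parameters consumed by at most one op go into params_not_shared, and only that filtered list is appended to repetitive_params for deletion from the fluid scope. A minimal sketch of the filter, using a hypothetical Node stand-in for the IR variable node:

#include <string>
#include <vector>

// Hypothetical stand-in for a variable node in the graph: `outputs` holds the
// ops that consume this tensor, `is_param` marks persistable parameters.
struct Node {
  std::string name;
  std::vector<Node*> outputs;
  bool is_param;
};

// A parameter is safe to delete from the fluid scope only when its data lives
// inside the TensorRT engine and no other op still reads it.
std::vector<std::string> FilterDeletableParams(const std::vector<Node*>& inputs) {
  std::vector<std::string> params_not_shared;
  for (const Node* x : inputs) {
    if (x->is_param && x->outputs.size() <= 1) {
      params_not_shared.push_back(x->name);
    }
  }
  return params_not_shared;
}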
......@@ -116,11 +116,10 @@ class BatchNormOpConverter : public OpConverter {
scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Y").front();
engine_->weight_map[op_desc.Input("Bias").front()] =
std::move(combile_bias_tensor);
engine_->weight_map[op_desc.Input("Scale").front()] =
std::move(combile_scale_tensor);
engine_->SetWeights(op_desc.Input("Bias").front(),
std::move(combile_bias_tensor));
engine_->SetWeights(op_desc.Input("Scale").front(),
std::move(combile_scale_tensor));
RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
}
};
......
......@@ -66,8 +66,8 @@ class DropoutOpConverter : public OpConverter {
nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
power_weights.get());
engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] =
std::move(weight_tensor);
engine_->SetWeights(op_desc.Output("Out").front() + "_dropout",
std::move(weight_tensor));
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
......
......@@ -72,7 +72,7 @@ class FcOpConverter : public OpConverter {
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
// This may trigger a GPU->CPU copy, because TRT's weight can only be
// assigned from CPU memory, that can't be avoided.
// assigned from CPU memory, which can't be avoided.
float* weight_data = nullptr;
bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
if (enable_int8) {
......@@ -131,7 +131,7 @@ class FcOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(X),
n_output, tmp_weight.get(), bias.get());
engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp);
engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp));
auto output_name = op_desc.Output("Out").front();
RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);
......
......@@ -81,7 +81,7 @@ class LeakyReluOpConverter : public OpConverter {
std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
engine_->weight_map.end());
engine_->weight_map[alpha_name] = std::move(alpha_tensor);
engine_->SetWeights(alpha_name, std::move(alpha_tensor));
#endif
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
......
......@@ -55,8 +55,8 @@ class PReluOpConverter : public OpConverter {
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep the alpha tensor to avoid releasing its memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_temp);
engine_->SetWeights(op_desc.Input("Alpha")[0],
std::move(alpha_tensor_temp));
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode);
......
......@@ -70,7 +70,7 @@ void TensorRTEngine::FreezeNetwork() {
}
#else
if (enable_fp16)
LOG(INFO) << "Using FP16 in Paddle-trt must ensure that the version of TRT "
LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT "
"is at least 5."
"So, use FP32 to run.";
#endif
......@@ -146,8 +146,8 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
PADDLE_ENFORCE(!output->isNetworkInput());
infer_network_->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput());
// output buffers' size can only be decided latter, set zero here to mark this
// and will reset latter.
// output buffers' size can only be decided later, set zero here to mark this
// and will reset later.
buffer_sizes_[name] = 0;
}
......@@ -164,8 +164,8 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
infer_network_->markOutput(*output);
// output buffers' size can only be decided latter, set zero here to mark this
// and will reset latter.
// output buffers' size can only be decided later, set zero here to mark this
// and will reset later.
buffer_sizes_[name] = 0;
}
......@@ -190,20 +190,26 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
framework::Tensor *weight_tensor,
bool enable_int8,
const std::vector<float> &scale) {
static int name_suffix_counter = 0;
std::string name_suffix = std::to_string(name_suffix_counter);
std::string name_with_suffix = name + name_suffix;
auto w_dims = weight_tensor->dims();
platform::CPUPlace cpu_place;
PADDLE_ENFORCE(!weight_map.count(name),
"During TRT Op converter: We set weight %s with the same name "
"twice into the weight_map",
name);
weight_map[name].reset(new framework::Tensor());
weight_map[name]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
PADDLE_ENFORCE_EQ(
weight_map.count(name_with_suffix), 0,
"During TRT Op converter: We set weight %s with the same name "
"twice into the weight_map",
name_with_suffix);
weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
float *weight_data =
weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
name_suffix_counter += 1;
if (enable_int8) {
// when the op is fc, scale's size should be 1
// when the op is conv, the scale's size should be w_dims[0]
// when the op is conv, scale's size should be w_dims[0]
bool valid_scale_size =
(scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
......
......@@ -15,10 +15,12 @@ limitations under the License. */
#pragma once
#include <NvInfer.h>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
......@@ -39,7 +41,7 @@ class TRTInt8Calibrator;
* TensorRT Engine.
*
* There are two alternative ways to use it, one is to build from a paddle
* protobuf model, another way is to manully construct the network.
* protobuf model, another way is to manually construct the network.
*/
class TensorRTEngine {
using DescType = ::paddle::framework::proto::BlockDesc;
......@@ -89,11 +91,11 @@ class TensorRTEngine {
infer_builder_.reset(createInferBuilder(&logger_));
infer_network_.reset(infer_builder_->createNetwork());
}
// After finishing adding ops, freeze this network and creates the executation
// After finishing adding ops, freeze this network and creates the execution
// environment.
void FreezeNetwork();
// Add an input and set its name, data type and dimention.
// Add an input and set its name, data type and dimension.
nvinfer1::ITensor* DeclareInput(const std::string& name,
nvinfer1::DataType dtype,
const nvinfer1::Dims& dim);
......@@ -151,6 +153,16 @@ class TensorRTEngine {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
// When setting weight_map, an auto-incrementing suffix is appended to the
// names so that weights registered under the same name do not collide.
void SetWeights(std::string w_name,
std::unique_ptr<framework::Tensor> w_tensor) {
static int suffix_counter = 0;
std::string suffix = std::to_string(suffix_counter);
weight_map[w_name + suffix] = std::move(w_tensor);
suffix_counter += 1;
}
void ClearWeights() {
for (auto& weight_pair : weight_map) {
weight_pair.second.reset(nullptr);
......@@ -209,7 +221,7 @@ class TensorRTEngine {
((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
// Add an layer__ into engine__ with args ARGS.
// Add a layer__ into engine__ with args ARGS.
// For example:
//
// Reference
......
......@@ -269,7 +269,7 @@ download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_dat
inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc)
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_tests_models")
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz")
endif()
......@@ -285,4 +285,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
endif()
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle {
namespace inference {
TEST(TensorRT, cascade_rcnn) {
std::string model_dir = FLAGS_infer_model + "/cascade_rcnn";
AnalysisConfig config;
int batch_size = 1;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.SwitchUseFeedFetchOps(false);
config.EnableTensorRtEngine(1 << 30, batch_size, 40,
AnalysisConfig::Precision::kFloat32, false);
auto predictor = CreatePaddlePredictor(config);
int channels = 3;
int height = 640;
int width = 640;
int input_num = batch_size * channels * height * width;
float *input = new float[input_num];
// fill every input element with 1.0f (memset only sets raw bytes, not floats)
for (int i = 0; i < input_num; ++i) input[i] = 1.0f;
float *im_shape = new float[3];
im_shape[0] = 3.0;
im_shape[1] = 640.0;
im_shape[2] = 640.0;
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({batch_size, channels, height, width});
input_t->copy_from_cpu(input);
auto input_t1 = predictor->GetInputTensor(input_names[1]);
input_t1->Reshape({batch_size, 3});
input_t1->copy_from_cpu(im_shape);
ASSERT_TRUE(predictor->ZeroCopyRun());
}
} // namespace inference
} // namespace paddle