From 74812d1c90eba8f9bd231312a5832eb32217bdcb Mon Sep 17 00:00:00 2001
From: Pei Yang
Date: Sat, 21 Sep 2019 09:17:44 +0800
Subject: [PATCH] Fix BUGS: paddle-TRT repeatedly sets weight_map and
 overdeletes repetitive_params (#19825)

* fix trt bugs when sharing params, test=develop

* add unittest for cascade_rcnn
---
 .../ir_passes/tensorrt_subgraph_pass.cc       | 15 +++--
 .../tensorrt/convert/batch_norm_op.cc         |  9 ++-
 .../inference/tensorrt/convert/dropout_op.cc  |  4 +-
 .../fluid/inference/tensorrt/convert/fc_op.cc |  4 +-
 .../tensorrt/convert/leaky_relu_op.cc         |  2 +-
 .../inference/tensorrt/convert/prelu_op.cc    |  4 +-
 paddle/fluid/inference/tensorrt/engine.cc     | 34 +++++-----
 paddle/fluid/inference/tensorrt/engine.h      | 20 ++++--
 .../fluid/inference/tests/api/CMakeLists.txt  |  5 +-
 .../tests/api/trt_cascade_rcnn_test.cc        | 62 +++++++++++++++++++
 10 files changed, 124 insertions(+), 35 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc

diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 1d1e4570e1e..bbe68a7bab5 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -102,7 +102,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // const framework::BlockDesc& main_block = program_desc->Block(0);
   framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
 
-  // An fake block desc.
+  // A fake block desc.
   framework::proto::BlockDesc block_proto;
   framework::BlockDesc block_desc(nullptr, &block_proto);
   block_desc.Proto()->set_parent_idx(-1);
@@ -118,20 +118,27 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   }
 
   // Then, we will use the input_names_with_id and output_names_with_id to
-  // generate the eigine key.
+  // generate the engine key.
   // So, We use set instead of unordered_set here to ensure that the engine key
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
   std::vector<std::string> params;
+  // If we delete the fluid copy of params shared by more than one op, there
+  // will be a problem, so we filter them out.
+  std::vector<std::string> params_not_shared;
 
-  // The node->inputs containes input tensors and parameters.
+  // The node->inputs contains input tensors and parameters.
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
     if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
       params.push_back(x->Name());
     }
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
+        x->outputs.size() <= 1) {
+      params_not_shared.push_back(x->Name());
+    }
   }
 
   std::set<std::string> output_names;
@@ -241,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
     return;
   }
 
-  std::copy(params.begin(), params.end(),
+  std::copy(params_not_shared.begin(), params_not_shared.end(),
             std::back_inserter(*repetitive_params));
 
   tensorrt::TensorRTEngine *trt_engine =
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index d9488684644..25f0d866dcd 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -116,11 +116,10 @@ class BatchNormOpConverter : public OpConverter {
                                  scale_weights.get(), power_weights.get());
 
     auto output_name = op_desc.Output("Y").front();
-    engine_->weight_map[op_desc.Input("Bias").front()] =
-        std::move(combile_bias_tensor);
-    engine_->weight_map[op_desc.Input("Scale").front()] =
-        std::move(combile_scale_tensor);
-
+    engine_->SetWeights(op_desc.Input("Bias").front(),
+                        std::move(combile_bias_tensor));
+    engine_->SetWeights(op_desc.Input("Scale").front(),
+                        std::move(combile_scale_tensor));
     RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
   }
 };
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
index 510b622f46f..cd28c6d98a0 100644
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -66,8 +66,8 @@ class DropoutOpConverter : public OpConverter {
                                    nvinfer1::ScaleMode::kUNIFORM,
                                    shift_weights.get(), scale_weights.get(),
                                    power_weights.get());
-    engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] =
-        std::move(weight_tensor);
+    engine_->SetWeights(op_desc.Output("Out").front() + "_dropout",
+                        std::move(weight_tensor));
     auto output_name = op_desc.Output("Out")[0];
 
     RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index fb7b89b189a..ea108d6a07e 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -72,7 +72,7 @@ class FcOpConverter : public OpConverter {
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
     // This may trigger a GPU->CPU copy, because TRT's weight can only be
-    // assigned from CPU memory, that can't be avoided.
+    // assigned from CPU memory, which can't be avoided.
     float* weight_data = nullptr;
     bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
     if (enable_int8) {
@@ -131,7 +131,7 @@ class FcOpConverter : public OpConverter {
         *const_cast<nvinfer1::ITensor*>(X), n_output, tmp_weight.get(),
         bias.get());
 
-    engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp);
+    engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp));
 
     auto output_name = op_desc.Output("Out").front();
     RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);
diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
index 2a46938cb10..f3c714009f8 100644
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
@@ -81,7 +81,7 @@ class LeakyReluOpConverter : public OpConverter {
   std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
   PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
                  engine_->weight_map.end());
-  engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+  engine_->SetWeights(alpha_name, std::move(alpha_tensor));
 #endif
   auto output_name = op_desc.Output("Out")[0];
   RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
index 01bcd03e522..d327a743662 100644
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -55,8 +55,8 @@ class PReluOpConverter : public OpConverter {
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
     // keep alpha tensor to avoid release it's memory
-    engine_->weight_map[op_desc.Input("Alpha")[0]] =
-        std::move(alpha_tensor_temp);
+    engine_->SetWeights(op_desc.Input("Alpha")[0],
+                        std::move(alpha_tensor_temp));
 
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 4a26417375a..f806069b476 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -70,7 +70,7 @@ void TensorRTEngine::FreezeNetwork() {
     }
 #else
     if (enable_fp16)
-      LOG(INFO) << "Using FP16 in Paddle-trt must ensure that the version of TRT "
+      LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT "
                    "is at least 5."
                    "So, use FP32 to run.";
 #endif
@@ -146,8 +146,8 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
   PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
   PADDLE_ENFORCE(output->isNetworkOutput());
-  // output buffers' size can only be decided latter, set zero here to mark this
-  // and will reset latter.
+  // output buffers' size can only be decided later, set zero here to mark this
+  // and will reset later.
   buffer_sizes_[name] = 0;
 }
 
@@ -164,8 +164,8 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
   output->setName(name.c_str());
   PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
-  // output buffers' size can only be decided latter, set zero here to mark this
-  // and will reset latter.
+  // output buffers' size can only be decided later, set zero here to mark this
+  // and will reset later.
   buffer_sizes_[name] = 0;
 }
 
@@ -190,20 +190,26 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
                                         framework::Tensor *weight_tensor,
                                         bool enable_int8,
                                         const std::vector<float> &scale) {
+  static int name_suffix_counter = 0;
+  std::string name_suffix = std::to_string(name_suffix_counter);
+  std::string name_with_suffix = name + name_suffix;
   auto w_dims = weight_tensor->dims();
   platform::CPUPlace cpu_place;
-  PADDLE_ENFORCE(!weight_map.count(name),
-                 "During TRT Op converter: We set weight %s with the same name "
-                 "twice into the weight_map",
-                 name);
-  weight_map[name].reset(new framework::Tensor());
-  weight_map[name]->Resize(weight_tensor->dims());
-  TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
-  float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
+  PADDLE_ENFORCE_EQ(
+      weight_map.count(name_with_suffix), 0,
+      "During TRT Op converter: We set weight %s with the same name "
+      "twice into the weight_map",
+      name_with_suffix);
+  weight_map[name_with_suffix].reset(new framework::Tensor());
+  weight_map[name_with_suffix]->Resize(weight_tensor->dims());
+  TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
+  float *weight_data =
+      weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
+  name_suffix_counter += 1;
   if (enable_int8) {
     // when the op is fc, scale's size should be 1
-    // when the op is conv, the scale's size should be w_dims[0]
+    // when the op is conv, scale's size should be w_dims[0]
     bool valid_scale_size =
         (scale.size() == 1 ||
          scale.size() == static_cast<size_t>(w_dims[0]));
     PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 19ec11017a4..c1d950035ca 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 
 #include <NvInfer.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -39,7 +41,7 @@ class TRTInt8Calibrator;
  * TensorRT Engine.
  *
  * There are two alternative ways to use it, one is to build from a paddle
- * protobuf model, another way is to manully construct the network.
+ * protobuf model, another way is to manually construct the network.
  */
 class TensorRTEngine {
   using DescType = ::paddle::framework::proto::BlockDesc;
@@ -89,11 +91,11 @@ class TensorRTEngine {
     infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
-  // After finishing adding ops, freeze this network and creates the executation
+  // After finishing adding ops, freeze this network and creates the execution
   // environment.
   void FreezeNetwork();
 
-  // Add an input and set its name, data type and dimention.
+  // Add an input and set its name, data type and dimension.
   nvinfer1::ITensor* DeclareInput(const std::string& name,
                                   nvinfer1::DataType dtype,
                                   const nvinfer1::Dims& dim);
@@ -151,6 +153,16 @@ class TensorRTEngine {
   std::unordered_map<std::string, std::unique_ptr<framework::Tensor>>
       weight_map;
 
+  // When setting weight_map, a self-increasing suffix is needed for the names
+  // so as to avoid repeatedly setting weights with the same name.
+  void SetWeights(std::string w_name,
+                  std::unique_ptr<framework::Tensor> w_tensor) {
+    static int suffix_counter = 0;
+    std::string suffix = std::to_string(suffix_counter);
+    weight_map[w_name + suffix] = std::move(w_tensor);
+    suffix_counter += 1;
+  }
+
   void ClearWeights() {
     for (auto& weight_pair : weight_map) {
       weight_pair.second.reset(nullptr);
@@ -209,7 +221,7 @@ class TensorRTEngine {
   ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
     NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
 
-// Add an layer__ into engine__ with args ARGS.
+// Add a layer__ into engine__ with args ARGS.
 // For example:
 //
 // Reference
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 997e7f44d9d..87e0fe7126b 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -269,7 +269,7 @@ download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_dat
 inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc)
 
 if(WITH_GPU AND TENSORRT_FOUND)
-  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_tests_models")
+  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz")
   endif()
@@ -285,4 +285,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
   inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
           EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
           ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc
+          EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+          ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
 endif()
diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
new file mode 100644
index 00000000000..35be7db560a
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+TEST(TensorRT, cascade_rcnn) {
+  std::string model_dir = FLAGS_infer_model + "/cascade_rcnn";
+  AnalysisConfig config;
+  int batch_size = 1;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableTensorRtEngine(1 << 30, batch_size, 40,
+                              AnalysisConfig::Precision::kFloat32, false);
+
+  auto predictor = CreatePaddlePredictor(config);
+
+  int channels = 3;
+  int height = 640;
+  int width = 640;
+  int input_num = batch_size * channels * height * width;
+  float *input = new float[input_num];
+  memset(input, 1.0, input_num * sizeof(float));
+
+  float *im_shape = new float[3];
+  im_shape[0] = 3.0;
+  im_shape[1] = 640.0;
+  im_shape[2] = 640.0;
+
+  auto input_names = predictor->GetInputNames();
+
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({batch_size, channels, height, width});
+  input_t->copy_from_cpu(input);
+
+  auto input_t1 = predictor->GetInputTensor(input_names[1]);
+  input_t1->Reshape({batch_size, 3});
+  input_t1->copy_from_cpu(im_shape);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+}
+
+}  // namespace inference
+}  // namespace paddle
--
GitLab
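
The essence of the change above: SetWeights() and GetWeightCPUData() now append a self-increasing counter to every weight name before inserting it into weight_map, so a fluid parameter shared by several ops can be registered more than once without tripping the duplicate-name check, while params_not_shared keeps shared parameters out of repetitive_params so their fluid copies are not deleted while another op still needs them. The snippet below is a minimal, self-contained sketch of that suffix idea in isolation; the WeightRegistry class, its method names, and the std::vector<float> payload are illustrative assumptions, not part of the Paddle-TRT API.

// Standalone sketch of the suffix-based registration used by SetWeights().
// All names here are hypothetical; only the renaming technique mirrors the patch.
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

class WeightRegistry {
 public:
  // Append an increasing suffix to the key, so registering the same parameter
  // name twice (e.g. a weight shared by two ops) yields two distinct entries
  // instead of overwriting or rejecting the second copy.
  void Set(const std::string& name, std::unique_ptr<std::vector<float>> data) {
    const std::string key = name + std::to_string(counter_++);
    weights_[key] = std::move(data);
  }

  size_t Size() const { return weights_.size(); }

 private:
  int counter_ = 0;
  std::map<std::string, std::unique_ptr<std::vector<float>>> weights_;
};

int main() {
  WeightRegistry registry;
  auto w0 = std::make_unique<std::vector<float>>(16, 1.0f);
  auto w1 = std::make_unique<std::vector<float>>(16, 1.0f);

  // "conv_w" is registered twice, as happens when two ops share one fluid
  // parameter; the suffix keeps the second call from colliding with the first.
  registry.Set("conv_w", std::move(w0));
  registry.Set("conv_w", std::move(w1));

  std::cout << registry.Size() << " weights registered\n";  // prints: 2
  return 0;
}

The cost of this scheme, as in the patch, is that a shared weight may be copied and kept once per registration; the benefit is that every copy stays alive for as long as something (here the map, in Paddle-TRT the engine) still references it.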