Fix BUGS: paddle-TRT repeatedly sets weight_map and overdeletes repetitive_params (#19825)

* fix trt bugs when sharing params, test=develop * add unittest for cascade_rcnn

Fix BUGS: paddle-TRT repeatedly sets weight_map and overdeletes repetitive_params (#19825)
* fix trt bugs when sharing params, test=develop * add unittest for cascade_rcnn
74812d1c · Pei Yang · GitHub · e2372750 · 74812d1c · 74812d1c
10 changed files
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -102,7 +102,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  // const framework::BlockDesc& main_block = program_desc->Block(0);
  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
-  // An fake block desc.
+  // A fake block desc.
  framework::proto::BlockDesc block_proto;
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
@@ -118,20 +118,27 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  }
  // Then, we will use the input_names_with_id and output_names_with_id to
-  // generate the eigine key.
+  // generate the engine key.
  // So, We use set instead of unordered_set here to ensure that the engine key
  // is unique.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
  std::vector<std::string> params;
+  // If we delete the fluid copy of params shared by more than one op, there
+  // will be problems, so we filter them out.
+  std::vector<std::string> params_not_shared;
-  // The node->inputs containes input tensors and parameters.
+  // The node->inputs contains input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
      params.push_back(x->Name());
    }
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 &&
+        x->outputs.size() <= 1) {
+      params_not_shared.push_back(x->Name());
+    }
  }
  std::set<std::string> output_names;
@@ -241,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
    return;
  }
-  std::copy(params.begin(), params.end(),
+  std::copy(params_not_shared.begin(), params_not_shared.end(),
            std::back_inserter(*repetitive_params));
  tensorrt::TensorRTEngine *trt_engine =

--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -116,11 +116,10 @@ class BatchNormOpConverter : public OpConverter {
                             scale_weights.get(), power_weights.get());
    auto output_name = op_desc.Output("Y").front();
-    engine_->weight_map[op_desc.Input("Bias").front()] =
+    engine_->SetWeights(op_desc.Input("Bias").front(),
-        std::move(combile_bias_tensor);
+                        std::move(combile_bias_tensor));
-    engine_->weight_map[op_desc.Input("Scale").front()] =
+    engine_->SetWeights(op_desc.Input("Scale").front(),
-        std::move(combile_scale_tensor);
+                        std::move(combile_scale_tensor));
    RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode);
  }
 };

--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -66,8 +66,8 @@ class DropoutOpConverter : public OpConverter {
        nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
        power_weights.get());
-    engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] =
+    engine_->SetWeights(op_desc.Output("Out").front() + "_dropout",
-        std::move(weight_tensor);
+                        std::move(weight_tensor));
    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);

--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -72,7 +72,7 @@ class FcOpConverter : public OpConverter {
    PADDLE_ENFORCE_NOT_NULL(Y_v);
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
    // This may trigger a GPU->CPU copy, because TRT's weight can only be
-    // assigned from CPU memory, that can't be avoided.
+    // assigned from CPU memory, which can't be avoided.
    float* weight_data = nullptr;
    bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8"));
    if (enable_int8) {
@@ -131,7 +131,7 @@ class FcOpConverter : public OpConverter {
                                       *const_cast<nvinfer1::ITensor*>(X),
                                       n_output, tmp_weight.get(), bias.get());
-    engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp);
+    engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp));
    auto output_name = op_desc.Output("Out").front();
    RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode);

--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
@@ -81,7 +81,7 @@ class LeakyReluOpConverter : public OpConverter {
    std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
    PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
                   engine_->weight_map.end());
-    engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+    engine_->SetWeights(alpha_name, std::move(alpha_tensor));
 #endif
    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},

--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -55,8 +55,8 @@ class PReluOpConverter : public OpConverter {
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
    // keep alpha tensor to avoid release it's memory
-    engine_->weight_map[op_desc.Input("Alpha")[0]] =
+    engine_->SetWeights(op_desc.Input("Alpha")[0],
-        std::move(alpha_tensor_temp);
+                        std::move(alpha_tensor_temp));
    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode);

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -70,7 +70,7 @@ void TensorRTEngine::FreezeNetwork() {
  }
 #else
  if (enable_fp16)
-    LOG(INFO) << "Using FP16 in Paddle-trt must ensure that the version of TRT "
+    LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT "
                 "is at least 5."
                 "So, use FP32 to run.";
 #endif
@@ -146,8 +146,8 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  PADDLE_ENFORCE(output->isNetworkOutput());
-  // output buffers' size can only be decided latter, set zero here to mark this
+  // output buffers' size can only be decided later, set zero here to mark this
-  // and will reset latter.
+  // and will reset later.
  buffer_sizes_[name] = 0;
 }
@@ -164,8 +164,8 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
-  // output buffers' size can only be decided latter, set zero here to mark this
+  // output buffers' size can only be decided later, set zero here to mark this
-  // and will reset latter.
+  // and will reset later.
  buffer_sizes_[name] = 0;
 }
@@ -190,20 +190,26 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
                                        framework::Tensor *weight_tensor,
                                        bool enable_int8,
                                        const std::vector<float> &scale) {
+  static int name_suffix_counter = 0;
+  std::string name_suffix = std::to_string(name_suffix_counter);
+  std::string name_with_suffix = name + name_suffix;
  auto w_dims = weight_tensor->dims();
  platform::CPUPlace cpu_place;
-  PADDLE_ENFORCE(!weight_map.count(name),
+  PADDLE_ENFORCE_EQ(
-                 "During TRT Op converter: We set weight %s with the same name "
+      weight_map.count(name_with_suffix), 0,
-                 "twice into the weight_map",
+      "During TRT Op converter: We set weight %s with the same name "
-                 name);
+      "twice into the weight_map",
-  weight_map[name].reset(new framework::Tensor());
+      name_with_suffix);
-  weight_map[name]->Resize(weight_tensor->dims());
+  weight_map[name_with_suffix].reset(new framework::Tensor());
-  TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
+  weight_map[name_with_suffix]->Resize(weight_tensor->dims());
-  float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
+  TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
+  float *weight_data =
+      weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
+  name_suffix_counter += 1;
  if (enable_int8) {
    // when the op is fc, scale's size should be 1
-    // when the op is conv, the scale's size should be w_dims[0]
+    // when the op is conv, scale's size should be w_dims[0]
    bool valid_scale_size =
        (scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
    PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 #include <NvInfer.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -39,7 +41,7 @@ class TRTInt8Calibrator;
 * TensorRT Engine.
 *
 * There are two alternative ways to use it, one is  to build from a paddle
- * protobuf model, another way is to manully construct the network.
+ * protobuf model, another way is to manually construct the network.
 */
 class TensorRTEngine {
  using DescType = ::paddle::framework::proto::BlockDesc;
@@ -89,11 +91,11 @@ class TensorRTEngine {
    infer_builder_.reset(createInferBuilder(&logger_));
    infer_network_.reset(infer_builder_->createNetwork());
  }
-  // After finishing adding ops, freeze this network and creates the executation
+  // After finishing adding ops, freeze this network and creates the execution
  // environment.
  void FreezeNetwork();
-  // Add an input and set its name, data type and dimention.
+  // Add an input and set its name, data type and dimension.
  nvinfer1::ITensor* DeclareInput(const std::string& name,
                                  nvinfer1::DataType dtype,
                                  const nvinfer1::Dims& dim);
@@ -151,6 +153,16 @@ class TensorRTEngine {
  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
      weight_map;
+  // Takes ownership of w_tensor and stores it in weight_map. The key is
+  // w_name followed by the current value of a running counter, so registering
+  // the same w_name more than once adds a new entry instead of replacing the
+  // earlier one. Note that the stored key is therefore never w_name itself.
+  void SetWeights(std::string w_name,
+                  std::unique_ptr<framework::Tensor> w_tensor) {
+    // Function-local static: one counter for all calls to SetWeights.
+    static int next_suffix = 0;
+    const std::string unique_key = w_name + std::to_string(next_suffix);
+    weight_map[unique_key] = std::move(w_tensor);
+    ++next_suffix;
+  }
  void ClearWeights() {
    for (auto& weight_pair : weight_map) {
      weight_pair.second.reset(nullptr);
@@ -209,7 +221,7 @@ class TensorRTEngine {
  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
-// Add an layer__ into engine__ with args ARGS.
+// Add a layer__ into engine__ with args ARGS.
 // For example:
 //
 // Reference

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -269,7 +269,7 @@ download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_dat
 inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc)
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_tests_models")
+    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz")
    endif()
@@ -285,4 +285,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
    inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+    inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
 endif()
--- a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+namespace paddle {
+namespace inference {
+// Smoke test for the cascade_rcnn model with the TensorRT engine enabled:
+// builds a predictor, feeds two input tensors and checks that a single
+// ZeroCopyRun() succeeds. Output values are not inspected.
+TEST(TensorRT, cascade_rcnn) {
+  std::string model_dir = FLAGS_infer_model + "/cascade_rcnn";
+  AnalysisConfig config;
+  int batch_size = 1;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableTensorRtEngine(1 << 30, batch_size, 40,
+                              AnalysisConfig::Precision::kFloat32, false);
+  auto predictor = CreatePaddlePredictor(config);
+  int channels = 3;
+  int height = 640;
+  int width = 640;
+  int input_num = batch_size * channels * height * width;
+  // Every element is set to 1.0f. memset() must not be used for this: it
+  // converts its value argument to a single byte and writes that byte to all
+  // positions, which does not produce the float value 1.0. The vectors also
+  // release their storage automatically, including when the assertion below
+  // fails and returns early.
+  std::vector<float> input(input_num, 1.0f);
+  // Second model input, fed below as a {batch_size, 3} tensor.
+  std::vector<float> im_shape = {3.0f, 640.0f, 640.0f};
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({batch_size, channels, height, width});
+  input_t->copy_from_cpu(input.data());
+  auto input_t1 = predictor->GetInputTensor(input_names[1]);
+  input_t1->Reshape({batch_size, 3});
+  input_t1->copy_from_cpu(im_shape.data());
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+}
+}  // namespace inference
+}  // namespace paddle