Commit d91e84ae

Authored on Aug 21, 2018 by nhzlx; committed by Paddle CI on Aug 28, 2018.
fix ssa bug with batchnorm and refine the trt interface
Merge pull request #12843 from NHZlX:fix_ssa_bug_for_trt
Parent: dafd6d06
Showing 9 changed files with 45 additions and 21 deletions (+45 -21)
paddle/fluid/inference/analysis/analyzer.cc                        +2  -1
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc   +0  -5
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h    +0  -3
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc   +1  -1
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc         +12 -1
paddle/fluid/inference/api/paddle_inference_api.h                  +8  -0
paddle/fluid/operators/tensorrt_engine_op.cc                       +2  -2
paddle/fluid/operators/tensorrt_engine_op.h                        +15 -6
paddle/fluid/operators/tensorrt_engine_op_test.cc                  +5  -2
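
Taken together, the diffs fall into two groups. The SSA fix: fluid_to_data_flow_graph_pass.cc now records a variable in unique_written_vars when it is wired up as an op input rather than as an op output, and the TensorRT teller in analyzer.cc additionally admits depthwise_conv2d and batch_norm. The interface refinement: the per-op attributes max_batch and max_workspace are removed in favor of the global flags FLAGS_tensorrt_max_batch_size and FLAGS_tensorrt_workspace_size, which TensorRTSubgraphPredictor::Init() fills from the new TensorRTConfig fields max_batch_size and workspace_size.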
paddle/fluid/inference/analysis/analyzer.cc

```diff
@@ -44,7 +44,8 @@ class DfgPassManagerImpl final : public DfgPassManager {
     if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
       auto trt_teller = [&](const Node* node) {
         std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
+             "depthwise_conv2d", "batch_norm"});
         if (!node->IsFunction()) return false;
         const auto* func = static_cast<const Function*>(node);
```
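
The teller is essentially a whitelist lookup on the op type. The snippet below is a minimal, self-contained sketch of that pattern; TrtSupports is a hypothetical stand-in, since the real teller first checks node->IsFunction() and then inspects the Function node to obtain the op type.

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

// Hypothetical stand-in for trt_teller: an op is eligible for the
// TensorRT subgraph iff its type is in the supported set.
bool TrtSupports(const std::string& op_type) {
  static const std::unordered_set<std::string> teller_set{
      "elementwise_add", "mul",     "conv2d",           "pool2d",
      "relu",            "softmax", "depthwise_conv2d", "batch_norm"};
  return teller_set.count(op_type) > 0;
}

int main() {
  std::cout << TrtSupports("batch_norm") << "\n";  // 1: supported after this commit
  std::cout << TrtSupports("lstm") << "\n";        // 0: left to the Fluid executor
}
```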
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc

```diff
@@ -23,9 +23,6 @@
 namespace paddle {
 namespace inference {
 
-DEFINE_int32(tensorrt_max_batchsize, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
-
 namespace analysis {
 
 using framework::proto::ProgramDesc;
@@ -190,8 +187,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
   // Set attrs
   SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
-  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());
```
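
Note the flag names being deleted here: tensorrt_max_batchsize (no underscore in "batchsize") with a workspace default of 2048. The replacements defined in tensorrt_engine_op.cc below use the names tensorrt_max_batch_size and tensorrt_workspace_size, and the engine op reads them directly, so this pass no longer needs to forward max_batch/max_workspace as attributes.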
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h

```diff
@@ -27,9 +27,6 @@
 namespace paddle {
 namespace inference {
 
-DECLARE_int32(tensorrt_max_batchsize);
-DECLARE_int32(tensorrt_workspace_size);
-
 namespace analysis {
 
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:
```
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc

```diff
@@ -92,6 +92,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
       auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
       in->outlinks.push_back(o);
       o->inlinks.push_back(in);
+      unique_written_vars.insert(in);
     }
   }
   for (int j = 0; j < op.outputs_size(); j++) {
@@ -112,7 +113,6 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
       }
       out->inlinks.push_back(o);
       o->outlinks.push_back(out);
-      unique_written_vars.insert(out);
     }
   }
 }
```
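
This hunk is the SSA bug from the commit title: unique_written_vars is now collected from the variables an op reads rather than the variables it writes. The diff does not spell out the failure mode, but batch_norm, whose state variables appear as both inputs and outputs of the op, is presumably the case that broke the old output-side bookkeeping.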
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc

```diff
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
@@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
     VLOG(3) << "Predictor::init()";
-
+    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
+    FLAGS_tensorrt_workspace_size = config_.workspace_size;
     if (config_.use_gpu) {
       place_ = paddle::platform::CUDAPlace(config_.device);
     } else {
@@ -150,3 +152,12 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
 }
 
 }  // namespace paddle
+
+USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(mul);
+USE_TRT_CONVERTER(conv2d);
+USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(fc);
+USE_TRT_CONVERTER(pool2d);
+USE_TRT_CONVERTER(softmax);
+USE_TRT_CONVERTER(batch_norm);
```
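
As I read it, the USE_TRT_CONVERTER lines (made available by the new op_converter.h include) force the corresponding converter registrations to be linked into any binary using this API, so the converters, including the batch_norm one the analyzer may now select, are guaranteed to be present at runtime.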
paddle/fluid/inference/api/paddle_inference_api.h

```diff
@@ -137,6 +137,14 @@ struct AnakinConfig : public PaddlePredictor::Config {
 struct TensorRTConfig : public NativeConfig {
   // Determine whether a subgraph will be executed by TRT.
   int min_subgraph_size{1};
+  // While TensorRT allows an engine optimized for a given max batch size
+  // to run at any smaller size, the performance for those smaller
+  // sizes may not be as well-optimized. Therefore, Max batch is best
+  // equivalent to the runtime batch size.
+  int max_batch_size{1};
+  // For workspace_size, refer it from here:
+  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
+  int workspace_size{1 << 30};
 };
 
 // A factory to help create different predictors.
```
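
A usage sketch of the refined interface, based only on what this diff shows: TensorRTConfig inherits NativeConfig, whose use_gpu and device members appear in the predictor's Init() above, and the factory specialization is the one named in this file's diff header. Model-loading details are elided.

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::TensorRTConfig config;
  config.use_gpu = true;            // NativeConfig member, seen in Init() above
  config.device = 0;                // GPU id, seen in Init() above
  config.max_batch_size = 4;        // copied into FLAGS_tensorrt_max_batch_size
  config.workspace_size = 1 << 30;  // copied into FLAGS_tensorrt_workspace_size

  auto predictor = paddle::CreatePaddlePredictor<
      paddle::TensorRTConfig, paddle::PaddleEngineKind::kAutoMixedTensorRT>(
      config);
  // predictor->Run(inputs, &outputs) as with the native predictor.
}
```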
paddle/fluid/operators/tensorrt_engine_op.cc

```diff
@@ -22,6 +22,8 @@
 namespace paddle {
 
 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
 
 namespace operators {
@@ -32,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
-    AddAttr<int>("max_batch", "the maximum batch size.");
-    AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
   }
 };
```
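
For scale: 16 << 20 is 16 MiB, while the 1 << 30 default exposed in TensorRTConfig is 1 GiB. Since Init() overwrites both flags from the config, the flag defaults matter only when the operator is exercised without the TensorRT predictor, as in the unit test below.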
paddle/fluid/operators/tensorrt_engine_op.h

```diff
@@ -28,6 +28,8 @@
 namespace paddle {
 
 DECLARE_int32(tensorrt_engine_batch_size);
+DECLARE_int32(tensorrt_max_batch_size);
+DECLARE_int32(tensorrt_workspace_size);
 
 namespace operators {
@@ -54,8 +56,10 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
                  "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
-  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
 }
 }  // namespace
@@ -95,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      context.Attr<int>("max_batch"));
+                      FLAGS_tensorrt_max_batch_size);
 
     std::vector<std::string> output_maps =
         context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -132,7 +136,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
-      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+      // The ITensor doesn't contain the batch size dim.
+      std::vector<int> ddim;
+      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      for (int i = 0; i < dims.nbDims; i++) {
+        ddim.push_back(dims.d[i]);
+      }
 
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
@@ -168,8 +177,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
     block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = context.Attr<int>("max_batch");
-    auto max_workspace = context.Attr<int>("max_workspace");
+    int max_batch = FLAGS_tensorrt_max_batch_size;
+    auto max_workspace = FLAGS_tensorrt_workspace_size;
     auto params = context.Attr<std::vector<std::string>>("parameters");
     std::unordered_set<std::string> parameters;
     for (const auto& param : params) {
```
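
The relaxed shape check admits 2-D inputs, padding them out to CHW. A minimal sketch of the mapping, with a plain struct standing in for nvinfer1::DimsCHW so it runs without the TensorRT headers:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct DimsCHW { int c, h, w; };  // stand-in for nvinfer1::DimsCHW

// Mirrors the new Vec2TRT_Dims logic: TensorRT dims omit the batch axis.
DimsCHW Vec2TrtDimsSketch(const std::vector<int64_t>& shape) {
  assert(shape.size() == 4 || shape.size() == 2);  // the relaxed enforce
  if (shape.size() == 4)
    return {static_cast<int>(shape[1]), static_cast<int>(shape[2]),
            static_cast<int>(shape[3])};
  return {static_cast<int>(shape[1]), 1, 1};  // [N, K] -> (K, 1, 1)
}

int main() {
  DimsCHW img = Vec2TrtDimsSketch({8, 3, 224, 224});  // (3, 224, 224)
  DimsCHW vec = Vec2TrtDimsSketch({8, 1000});         // (1000, 1, 1)
  std::cout << img.c << " " << vec.c << "\n";         // prints "3 1000"
}
```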
paddle/fluid/operators/tensorrt_engine_op_test.cc

```diff
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;
 
 TEST(TensorRTEngineOp, manual) {
+  FLAGS_tensorrt_engine_batch_size = 2;
+  FLAGS_tensorrt_max_batch_size = 2;
   framework::ProgramDesc program;
   auto* block_ = program.Proto()->add_blocks();
   block_->set_idx(0);
@@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -128,6 +129,8 @@ TEST(TensorRTEngineOp, manual) {
 }
 
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  FLAGS_tensorrt_engine_batch_size = batch_size;
+  FLAGS_tensorrt_max_batch_size = batch_size;
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CUDAPlace place;
```