refactor(mgb/opr): make trt batch flag only depend on inputs dimension

GitOrigin-RevId: f2f1a1076201cbd9f8c9d73efc153ef80b08fd27

refactor(mgb/opr): make trt batch flag only depend on inputs dimension
GitOrigin-RevId: f2f1a1076201cbd9f8c9d73efc153ef80b08fd27
6de3e4ba · Megvii Engine Team · ce610ca3 · 6de3e4ba · 6de3e4ba · 6de3e4ba
5 changed files
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
 dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
 dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
 tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
-*.caffemodel filter=lfs diff=lfs merge=lfs -text
 imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
 ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
 ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text

--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -72,12 +72,11 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
    size_t nr_input = 0;
    bool is_input = true;
    for (int i = 0; i < m_engine->getNbBindings(); ++i) {
-        // nbDims == 3, means CHW, without batch
-        if (m_engine->getBindingDimensions(i).nbDims != 3)
-            m_trt_engine_has_batch = true;
-
        if (m_engine->bindingIsInput(nr_input)) {
            mgb_assert(is_input, "mixed input/output bindings");
+            // nbDims == 3, means CHW, without batch
+            if (m_engine->getBindingDimensions(nr_input).nbDims != 3)
+                m_trt_engine_has_batch = true;
            ++nr_input;
        } else {
            is_input = false;

--- a/src/tensorrt/test/make_trt_net.cpp
+++ b/src/tensorrt/test/make_trt_net.cpp
@@ -106,6 +106,70 @@ intl::SimpleTensorRTNetwork::create_trt_network(bool has_batch_dim) {
    return std::make_pair(builder, network);
 }

+intl::BatchedTensorRTNetwork::BatchedTensorRTNetwork() {  // builds the reference graph: x -> Reduce(SUM) -> Reduce(SUM) -> Reshape -> y
+    host_x = gen({23, 28, 28});  // 3-D host input; no leading batch axis
+
+    graph = ComputingGraph::make();
+    x = Host2DeviceCopy::make(*graph, host_x);
+    opr::Reduce::Param param1{Reduce::Mode::SUM, 0, Reduce::Param::DataType::DEFAULT};  // second field is presumably the reduce axis (0 here) -- confirm against opr::Reduce::Param
+    opr::Reduce::Param param2{Reduce::Mode::SUM, 1, Reduce::Param::DataType::DEFAULT};  // same, with 1
+    auto y0 = opr::Reduce::make(x, param1);
+    auto y1 = opr::Reduce::make(y0, param2);  // second reduction is applied to the result of the first
+    TensorShape tshp{1, 28};
+    y = opr::Reshape::make(y1, tshp);  // expected output, reshaped to {1, 28}
+}
+
+std::pair<nvinfer1::IBuilder*, INetworkDefinition*>  // returned as {builder, network}, both raw pointers
+intl::BatchedTensorRTNetwork::create_trt_network(bool has_batch_dim) {  // has_batch_dim selects a 4-D {1,23,28,28} vs 3-D {23,28,28} "data" input
+    CompNode::load("xpu0").activate();
+    auto builder = createInferBuilder(TensorRTOpr::Logger::instance());
+#if NV_TENSOR_RT_VERSION >= 6001
+    nvinfer1::NetworkDefinitionCreationFlags flags;
+    ::memset(&flags, 0, sizeof(nvinfer1::NetworkDefinitionCreationFlags));  // start with no creation flags set
+    if (has_batch_dim)  // only then is the kEXPLICIT_BATCH bit set
+        flags = 1 << static_cast<int>(nvinfer1::NetworkDefinitionCreationFlag::
+                                              kEXPLICIT_BATCH);
+    auto network = builder->createNetworkV2(flags);
+#else
+    auto network = builder->createNetwork();
+#endif
+    nvinfer1::ITensor* data;  // assigned by one of the addInput branches below
+#if NV_TENSOR_RT_VERSION >= 6001
+    if (has_batch_dim) {
+        data = network->addInput("data", DataType::kFLOAT,
+                                 Dims4{1, 23, 28, 28});
+    } else {
+        data = network->addInput("data", DataType::kFLOAT, Dims3{23, 28, 28});
+    }
+    {
+        nvinfer1::TensorFormats formats =
+                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
+        data->setAllowedFormats(formats);  // only the kLINEAR bit is allowed for the input
+    }
+#else
+    if (has_batch_dim) {
+        data = network->addInput("data", DataType::kFLOAT,
+                                 DimsNCHW{1, 23, 28, 28});
+    } else {
+        data = network->addInput("data", DataType::kFLOAT, DimsCHW{23, 28, 28});
+    }
+#endif
+    mgb_assert(data != nullptr, "data is invalid");
+    auto reduce1 = network->addReduce(*data, nvinfer1::ReduceOperation::kSUM, 3, false);  // NOTE(review): 3 is presumably TensorRT's reduceAxes bitmask (bits 0 and 1) and false is keepDimensions -- confirm against the addReduce docs
+    mgb_assert(reduce1 != nullptr, "reduce1 is invalid");
+    reduce1->getOutput(0)->setName("prob");  // the reduce layer's output is the single marked output, named "prob"
+    network->markOutput(*reduce1->getOutput(0));
+#if NV_TENSOR_RT_VERSION >= 6001
+    {
+        nvinfer1::TensorFormats formats =
+                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
+        reduce1->getOutput(0)->setAllowedFormats(formats);  // same kLINEAR-only restriction for the output
+    }
+#endif
+
+    return std::make_pair(builder, network);  // the test in this diff wraps both in TensorRTUniquePtr
+}
+
 intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() {
    host_x = range_gen({32, 8, 28, 28});
    host_w = weight_gen({8, 8, 3, 3});

--- a/src/tensorrt/test/make_trt_net.h
+++ b/src/tensorrt/test/make_trt_net.h
@@ -48,6 +48,20 @@ struct SimpleTensorRTNetwork {
    create_trt_network(bool has_batch_dim);
 };

+struct BatchedTensorRTNetwork {  // test fixture: a reference graph (x -> y) plus a builder for a matching TensorRT network
+    HostTensorGenerator<> gen;  // used by the constructor to create host_x
+    std::shared_ptr<HostTensorND> host_x, host_w, host_b;  // NOTE(review): the constructor in this change only assigns host_x; host_w/host_b look unused -- confirm
+    std::shared_ptr<ComputingGraph> graph;
+    SymbolVar x, y;  // x: device copy of host_x; y: reference output
+
+    HostTensorND host_z1;  // NOTE(review): not referenced by the code in this change (the test declares its own local host_z1) -- confirm
+
+    BatchedTensorRTNetwork();
+
+    std::pair<nvinfer1::IBuilder*, INetworkDefinition*>  // {builder, network} as raw pointers
+    create_trt_network(bool has_batch_dim);
+};
+
 struct SimpleQuantizedTensorRTNetwork {
    HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> weight_gen{
            1*1.1f, 127*1.1f};

--- a/src/tensorrt/test/tensorrt_runtime.cpp
+++ b/src/tensorrt/test/tensorrt_runtime.cpp
@@ -62,6 +62,37 @@ TEST(TestOprTensorRT, RuntimeBasic) {
 }


+TEST(TestOprTensorRT, RuntimeBasicBatched) {  // builds an engine from a 3-D-input network, feeds it a 4-D input, and compares with the reference graph
+    REQUIRE_GPU(1);
+    intl::BatchedTensorRTNetwork net;
+    auto make_trt = [&net]() {
+        auto p = net.create_trt_network(false);  // has_batch_dim=false: "data" is declared as 3-D {23, 28, 28}
+        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
+        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
+        builder->setMaxBatchSize(5);
+#if NV_TENSOR_RT_VERSION >= 6001
+        TensorRTUniquePtr<IBuilderConfig> build_config{
+                builder->createBuilderConfig()};
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{
+                builder->buildEngineWithConfig(*trt_net, *build_config)};
+#else
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{
+                builder->buildCudaEngine(*trt_net)};
+#endif
+        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};  // serialized engine; its bytes are passed to the runtime opr below
+        auto nx = opr::Broadcast::make(net.x, {1, net.x.shape()[0], net.x.shape()[1], net.x.shape()[2]});  // prepend a leading 1 to x's 3-D shape, giving the 4-D runtime input
+        return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {nx})[0];  // first output of the runtime opr
+    };
+    auto y2 = make_trt();
+
+    HostTensorND host_z1;  // receives the reference output net.y
+    HostTensorND host_z2;  // receives the TensorRT runtime output y2
+    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
+                                    make_callback_copy(y2, host_z2)});
+    func->execute();
+    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);  // both outputs must agree within 5e-4
+}
+

 TEST(TestOprTensorRT, ConcatRuntimeBasic) {
    REQUIRE_GPU(1);