diff --git a/.gitattributes b/.gitattributes index 07945c65763c2b19baf583f910ab3f253d63b3c8..ca42b36f94b6c841a299ec08a48b290d241c61f7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary dnn/src/cuda/sass/prebuilt/map_defs.cpp binary dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text -*.caffemodel filter=lfs diff=lfs merge=lfs -text imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp index ffb5a5118c4954c90e52f87da06f9213d52cb1e6..ee8a86397f844b75761b7e25df8e23e819b59e73 100644 --- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp +++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp @@ -72,12 +72,11 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr( size_t nr_input = 0; bool is_input = true; for (int i = 0; i < m_engine->getNbBindings(); ++i) { - // nbDims == 3, means CHW, without batch - if (m_engine->getBindingDimensions(i).nbDims != 3) - m_trt_engine_has_batch = true; - if (m_engine->bindingIsInput(nr_input)) { mgb_assert(is_input, "mixed input/output bindings"); + // nbDims == 3, means CHW, without batch + if (m_engine->getBindingDimensions(nr_input).nbDims != 3) + m_trt_engine_has_batch = true; ++nr_input; } else { is_input = false; diff --git a/src/tensorrt/test/make_trt_net.cpp b/src/tensorrt/test/make_trt_net.cpp index b1974f39b26b3665a93885df82aa5ad54a0d43ae..cef69a43df9ad8afba78a83a93f3548b23f51304 100644 --- a/src/tensorrt/test/make_trt_net.cpp +++ b/src/tensorrt/test/make_trt_net.cpp @@ -106,6 +106,70 @@ intl::SimpleTensorRTNetwork::create_trt_network(bool has_batch_dim) { return std::make_pair(builder, network); } +intl::BatchedTensorRTNetwork::BatchedTensorRTNetwork() { + host_x = gen({23, 28, 28}); + + graph = ComputingGraph::make(); + x = Host2DeviceCopy::make(*graph, host_x); + opr::Reduce::Param param1{Reduce::Mode::SUM, 0, Reduce::Param::DataType::DEFAULT}; + opr::Reduce::Param param2{Reduce::Mode::SUM, 1, Reduce::Param::DataType::DEFAULT}; + auto y0 = opr::Reduce::make(x, param1); + auto y1 = opr::Reduce::make(y0, param2); + TensorShape tshp{1, 28}; + y = opr::Reshape::make(y1, tshp); +} + +std::pair +intl::BatchedTensorRTNetwork::create_trt_network(bool has_batch_dim) { + CompNode::load("xpu0").activate(); + auto builder = createInferBuilder(TensorRTOpr::Logger::instance()); +#if NV_TENSOR_RT_VERSION >= 6001 + nvinfer1::NetworkDefinitionCreationFlags flags; + ::memset(&flags, 0, sizeof(nvinfer1::NetworkDefinitionCreationFlags)); + if (has_batch_dim) + flags = 1 << static_cast(nvinfer1::NetworkDefinitionCreationFlag:: + kEXPLICIT_BATCH); + auto network = builder->createNetworkV2(flags); +#else + auto network = builder->createNetwork(); +#endif + nvinfer1::ITensor* data; +#if NV_TENSOR_RT_VERSION >= 6001 + if (has_batch_dim) { + data = network->addInput("data", DataType::kFLOAT, + Dims4{1, 23, 28, 28}); + } else { + data = network->addInput("data", DataType::kFLOAT, Dims3{23, 28, 28}); + } + { + nvinfer1::TensorFormats formats = + 1 << static_cast(nvinfer1::TensorFormat::kLINEAR); + data->setAllowedFormats(formats); + } +#else + if (has_batch_dim) { + data = network->addInput("data", DataType::kFLOAT, + DimsNCHW{1, 23, 28, 28}); + } else { + data = network->addInput("data", DataType::kFLOAT, DimsCHW{23, 28, 28}); + } +#endif + mgb_assert(data != nullptr, "data is invalid"); + auto reduce1 = network->addReduce(*data, nvinfer1::ReduceOperation::kSUM, 3, false); + mgb_assert(reduce1 != nullptr, "reduce1 is invalid"); + reduce1->getOutput(0)->setName("prob"); + network->markOutput(*reduce1->getOutput(0)); +#if NV_TENSOR_RT_VERSION >= 6001 + { + nvinfer1::TensorFormats formats = + 1 << static_cast(nvinfer1::TensorFormat::kLINEAR); + reduce1->getOutput(0)->setAllowedFormats(formats); + } +#endif + + return std::make_pair(builder, network); +} + intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { host_x = range_gen({32, 8, 28, 28}); host_w = weight_gen({8, 8, 3, 3}); diff --git a/src/tensorrt/test/make_trt_net.h b/src/tensorrt/test/make_trt_net.h index 672e9d81e76a540db337037ff4a8d25489f60a38..d5e4b913996393a9bc256cdf0ce6cdee87701c62 100644 --- a/src/tensorrt/test/make_trt_net.h +++ b/src/tensorrt/test/make_trt_net.h @@ -48,6 +48,20 @@ struct SimpleTensorRTNetwork { create_trt_network(bool has_batch_dim); }; +struct BatchedTensorRTNetwork { + HostTensorGenerator<> gen; + std::shared_ptr host_x, host_w, host_b; + std::shared_ptr graph; + SymbolVar x, y; + + HostTensorND host_z1; + + BatchedTensorRTNetwork(); + + std::pair + create_trt_network(bool has_batch_dim); +}; + struct SimpleQuantizedTensorRTNetwork { HostTensorGenerator weight_gen{ 1*1.1f, 127*1.1f}; diff --git a/src/tensorrt/test/tensorrt_runtime.cpp b/src/tensorrt/test/tensorrt_runtime.cpp index 4cd9b1ef4f49d2d33dab14290b3778260896a6f7..bba31c0b499815e2e31bd0b94f6ae95569e4467c 100644 --- a/src/tensorrt/test/tensorrt_runtime.cpp +++ b/src/tensorrt/test/tensorrt_runtime.cpp @@ -62,6 +62,37 @@ TEST(TestOprTensorRT, RuntimeBasic) { } +TEST(TestOprTensorRT, RuntimeBasicBatched) { + REQUIRE_GPU(1); + intl::BatchedTensorRTNetwork net; + auto make_trt = [&net]() { + auto p = net.create_trt_network(false); + TensorRTUniquePtr trt_net{p.second, {}}; + TensorRTUniquePtr builder{p.first, {}}; + builder->setMaxBatchSize(5); +#if NV_TENSOR_RT_VERSION >= 6001 + TensorRTUniquePtr build_config{ + builder->createBuilderConfig()}; + TensorRTUniquePtr cuda_engine{ + builder->buildEngineWithConfig(*trt_net, *build_config)}; +#else + TensorRTUniquePtr cuda_engine{ + builder->buildCudaEngine(*trt_net)}; +#endif + TensorRTUniquePtr mem{cuda_engine->serialize(), {}}; + auto nx = opr::Broadcast::make(net.x, {1, net.x.shape()[0], net.x.shape()[1], net.x.shape()[2]}); + return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {nx})[0]; + }; + auto y2 = make_trt(); + + HostTensorND host_z1; + HostTensorND host_z2; + auto func = net.graph->compile({make_callback_copy(net.y, host_z1), + make_callback_copy(y2, host_z2)}); + func->execute(); + MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4); +} + TEST(TestOprTensorRT, ConcatRuntimeBasic) { REQUIRE_GPU(1);