提交 6de3e4ba 编写于 作者: M Megvii Engine Team

refactor(mgb/opr): make trt batch flag only depend on inputs dimension

GitOrigin-RevId: f2f1a1076201cbd9f8c9d73efc153ef80b08fd27
上级 ce610ca3
...@@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary ...@@ -7,7 +7,6 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
*.caffemodel filter=lfs diff=lfs merge=lfs -text
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text ci/resource/models/float/shufflenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
......
...@@ -72,12 +72,11 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr( ...@@ -72,12 +72,11 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
size_t nr_input = 0; size_t nr_input = 0;
bool is_input = true; bool is_input = true;
for (int i = 0; i < m_engine->getNbBindings(); ++i) { for (int i = 0; i < m_engine->getNbBindings(); ++i) {
// nbDims == 3, means CHW, without batch
if (m_engine->getBindingDimensions(i).nbDims != 3)
m_trt_engine_has_batch = true;
if (m_engine->bindingIsInput(nr_input)) { if (m_engine->bindingIsInput(nr_input)) {
mgb_assert(is_input, "mixed input/output bindings"); mgb_assert(is_input, "mixed input/output bindings");
// nbDims == 3, means CHW, without batch
if (m_engine->getBindingDimensions(nr_input).nbDims != 3)
m_trt_engine_has_batch = true;
++nr_input; ++nr_input;
} else { } else {
is_input = false; is_input = false;
......
...@@ -106,6 +106,70 @@ intl::SimpleTensorRTNetwork::create_trt_network(bool has_batch_dim) { ...@@ -106,6 +106,70 @@ intl::SimpleTensorRTNetwork::create_trt_network(bool has_batch_dim) {
return std::make_pair(builder, network); return std::make_pair(builder, network);
} }
/*!
 * \brief Build the reference computing graph used to validate the TensorRT
 *      network produced by create_trt_network().
 *
 * A randomly generated {23, 28, 28} host tensor is copied to the device,
 * summed along axis 0 and then along axis 1, and the result is reshaped to
 * {1, 28}. The graph output is stored in \c y and its input in \c x.
 */
intl::BatchedTensorRTNetwork::BatchedTensorRTNetwork() {
    host_x = gen({23, 28, 28});
    graph = ComputingGraph::make();
    x = Host2DeviceCopy::make(*graph, host_x);

    // Two consecutive SUM reductions: first over axis 0, then over axis 1.
    opr::Reduce::Param sum_over_axis0{Reduce::Mode::SUM, 0,
                                      Reduce::Param::DataType::DEFAULT};
    opr::Reduce::Param sum_over_axis1{Reduce::Mode::SUM, 1,
                                      Reduce::Param::DataType::DEFAULT};
    auto reduced_once = opr::Reduce::make(x, sum_over_axis0);
    auto reduced_twice = opr::Reduce::make(reduced_once, sum_over_axis1);

    // Collapse the reduced tensor into the {1, 28} layout exposed as y.
    TensorShape out_shape{1, 28};
    y = opr::Reshape::make(reduced_twice, out_shape);
}
/*!
 * \brief Create a TensorRT network that sums a {23, 28, 28} float input over
 *      its first two non-batch axes, mirroring the reference graph built by
 *      the constructor.
 *
 * \param has_batch_dim if true, the input is declared with a leading batch
 *      axis of size 1 (and, for TensorRT >= 6.0.1, the network is created
 *      with the explicit-batch flag); otherwise the input is declared as a
 *      3-dimensional tensor.
 * \return the builder and the network definition as raw pointers; this
 *      function does not release them, so the caller is expected to take
 *      ownership of both.
 */
std::pair<nvinfer1::IBuilder*, INetworkDefinition*>
intl::BatchedTensorRTNetwork::create_trt_network(bool has_batch_dim) {
    CompNode::load("xpu0").activate();
    auto builder = createInferBuilder(TensorRTOpr::Logger::instance());
#if NV_TENSOR_RT_VERSION >= 6001
    // The creation flags are an integer bitmask, so plain zero-initialisation
    // is enough; no memset is needed.
    nvinfer1::NetworkDefinitionCreationFlags flags = 0;
    if (has_batch_dim)
        flags = 1 << static_cast<int>(nvinfer1::NetworkDefinitionCreationFlag::
                                              kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(flags);
#else
    auto network = builder->createNetwork();
#endif
    nvinfer1::ITensor* data;
#if NV_TENSOR_RT_VERSION >= 6001
    if (has_batch_dim) {
        data = network->addInput("data", DataType::kFLOAT,
                                 Dims4{1, 23, 28, 28});
    } else {
        data = network->addInput("data", DataType::kFLOAT, Dims3{23, 28, 28});
    }
    {
        nvinfer1::TensorFormats formats =
                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
        data->setAllowedFormats(formats);
    }
#else
    if (has_batch_dim) {
        data = network->addInput("data", DataType::kFLOAT,
                                 DimsNCHW{1, 23, 28, 28});
    } else {
        data = network->addInput("data", DataType::kFLOAT, DimsCHW{23, 28, 28});
    }
#endif
    mgb_assert(data != nullptr, "data is invalid");

    // The reduce axes are a bitmask over the declared input dimensions. When
    // the input carries a leading batch axis, the two axes to be summed (sizes
    // 23 and 28) sit at positions 1 and 2 instead of 0 and 1; shifting the
    // mask keeps the batch axis intact so the output stays consistent with
    // the {1, 28} reference result. Without a batch axis the mask is 3, as
    // before.
    const uint32_t first_reduced_axis = has_batch_dim ? 1u : 0u;
    const uint32_t reduce_axes =
            (1u << first_reduced_axis) | (1u << (first_reduced_axis + 1));
    auto reduce1 = network->addReduce(*data, nvinfer1::ReduceOperation::kSUM,
                                      reduce_axes, false);
    mgb_assert(reduce1 != nullptr, "reduce1 is invalid");
    reduce1->getOutput(0)->setName("prob");
    network->markOutput(*reduce1->getOutput(0));
#if NV_TENSOR_RT_VERSION >= 6001
    {
        nvinfer1::TensorFormats formats =
                1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
        reduce1->getOutput(0)->setAllowedFormats(formats);
    }
#endif
    return std::make_pair(builder, network);
}
intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() {
host_x = range_gen({32, 8, 28, 28}); host_x = range_gen({32, 8, 28, 28});
host_w = weight_gen({8, 8, 3, 3}); host_w = weight_gen({8, 8, 3, 3});
......
...@@ -48,6 +48,20 @@ struct SimpleTensorRTNetwork { ...@@ -48,6 +48,20 @@ struct SimpleTensorRTNetwork {
create_trt_network(bool has_batch_dim); create_trt_network(bool has_batch_dim);
}; };
/*!
 * \brief Test fixture pairing a reference computing graph with a factory for
 *      a TensorRT network, so that the outputs of both can be compared.
 */
struct BatchedTensorRTNetwork {
    //! generator used to create host-side input data
    HostTensorGenerator<> gen;

    //! host-side tensors
    std::shared_ptr<HostTensorND> host_x;
    std::shared_ptr<HostTensorND> host_w;
    std::shared_ptr<HostTensorND> host_b;

    //! graph that owns the reference computation
    std::shared_ptr<ComputingGraph> graph;

    //! symbolic variables of the reference graph
    SymbolVar x;
    SymbolVar y;

    //! host tensor available for holding a copied-back result
    HostTensorND host_z1;

    BatchedTensorRTNetwork();

    //! create the TensorRT builder and network definition; \p has_batch_dim
    //! selects whether the network input is declared with a batch axis
    std::pair<nvinfer1::IBuilder*, INetworkDefinition*>
    create_trt_network(bool has_batch_dim);
};
struct SimpleQuantizedTensorRTNetwork { struct SimpleQuantizedTensorRTNetwork {
HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> weight_gen{ HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> weight_gen{
1*1.1f, 127*1.1f}; 1*1.1f, 127*1.1f};
......
...@@ -62,6 +62,37 @@ TEST(TestOprTensorRT, RuntimeBasic) { ...@@ -62,6 +62,37 @@ TEST(TestOprTensorRT, RuntimeBasic) {
} }
/*
 * Serialises an engine built from BatchedTensorRTNetwork (created without a
 * batch dimension in the network input), runs it through TensorRTRuntimeOpr
 * on a 4-dimensional input, and checks the result against the reference
 * graph output.
 */
TEST(TestOprTensorRT, RuntimeBasicBatched) {
    REQUIRE_GPU(1);
    intl::BatchedTensorRTNetwork net;

    auto build_runtime_opr = [&net]() {
        auto builder_and_network = net.create_trt_network(false);
        // Keep the declaration order: it determines the destruction order of
        // the TensorRT handles.
        TensorRTUniquePtr<INetworkDefinition> trt_net{
                builder_and_network.second, {}};
        TensorRTUniquePtr<IBuilder> builder{builder_and_network.first, {}};
        builder->setMaxBatchSize(5);
#if NV_TENSOR_RT_VERSION >= 6001
        TensorRTUniquePtr<IBuilderConfig> build_config{
                builder->createBuilderConfig()};
        TensorRTUniquePtr<ICudaEngine> cuda_engine{
                builder->buildEngineWithConfig(*trt_net, *build_config)};
#else
        TensorRTUniquePtr<ICudaEngine> cuda_engine{
                builder->buildCudaEngine(*trt_net)};
#endif
        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};

        // Prepend a batch axis of size 1 to the 3-dimensional graph input.
        auto&& in_shape = net.x.shape();
        auto nx = opr::Broadcast::make(
                net.x, {1, in_shape[0], in_shape[1], in_shape[2]});
        return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {nx})[0];
    };

    auto y2 = build_runtime_opr();

    // host_z1: reference graph output; host_z2: TensorRT runtime output.
    HostTensorND host_z1;
    HostTensorND host_z2;
    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
                                    make_callback_copy(y2, host_z2)});
    func->execute();
    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);
}
TEST(TestOprTensorRT, ConcatRuntimeBasic) { TEST(TestOprTensorRT, ConcatRuntimeBasic) {
REQUIRE_GPU(1); REQUIRE_GPU(1);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册