diff --git a/imperative/python/megengine/core/tensor/megbrain_graph.py b/imperative/python/megengine/core/tensor/megbrain_graph.py
index 0799c726e1001a38d35d40956fc5923630f8ba34..2cb8187431bc09e161b0dde2dfbbd16fff434321 100644
--- a/imperative/python/megengine/core/tensor/megbrain_graph.py
+++ b/imperative/python/megengine/core/tensor/megbrain_graph.py
@@ -266,6 +266,8 @@ def optimize_for_inference(dest_vars, **kwargs):
         input for inference on nvidia backend(this optimization pass will
         result in mismatch of the precision of output of training and
         inference)
+    * enable_fuse_preprocess: whether to fuse preprocessing operators such as
+        astype, pad channel and dimshuffle that follow the h2d opr.
     """
     inference_options = GraphOptimizeOptions()
     inference_optimize_layout_transform_map = {
@@ -291,6 +293,8 @@ def optimize_for_inference(dest_vars, **kwargs):
         inference_options.fuse_conv_bias_nonlinearity = True
     if kwargs.pop("enable_fuse_conv_bias_with_z", False):
         inference_options.fuse_conv_bias_with_z = True
+    if kwargs.pop("enable_fuse_preprocess", False):
+        inference_options.fuse_preprocess = True
 
     if kwargs:
         raise ValueError("unknown options: %s" % list(kwargs))
@@ -335,6 +339,8 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]:
         ret["enable_fuse_conv_bias_nonlinearity"] = True
     if inference_options.fuse_conv_bias_with_z:
         ret["enable_fuse_conv_bias_with_z"] = True
+    if inference_options.fuse_preprocess:
+        ret["enable_fuse_preprocess"] = True
 
     return ret
 
diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp
index b6fb0c202b7a4b035aa5c3f4854d417c6caa46f2..44736685daab2bccde612839bdbfb6ee0cda07ea 100644
--- a/imperative/python/src/graph_rt.cpp
+++ b/imperative/python/src/graph_rt.cpp
@@ -251,6 +251,7 @@ void init_graph_rt(py::module m) {
         .def_readwrite("f16_io_comp", &_OptimizeForInferenceOptions::f16_io_comp)
         .def_readwrite("fuse_conv_bias_nonlinearity", &_OptimizeForInferenceOptions::fuse_conv_bias_nonlinearity)
         .def_readwrite("fuse_conv_bias_with_z", &_OptimizeForInferenceOptions::fuse_conv_bias_with_z)
+        .def_readwrite("fuse_preprocess", &_OptimizeForInferenceOptions::fuse_preprocess)
         .def_readwrite("layout_transform", &_OptimizeForInferenceOptions::layout_transform)
         ;
 
diff --git a/sdk/load-and-run/dump_with_testcase_mge.py b/sdk/load-and-run/dump_with_testcase_mge.py
index bc6f4a2ba01037e7364db9108cf188b2699c87fc..8f173356b0ee4b734bdaf1d708dc9265ca5c2eec 100755
--- a/sdk/load-and-run/dump_with_testcase_mge.py
+++ b/sdk/load-and-run/dump_with_testcase_mge.py
@@ -309,6 +309,7 @@ def optimize_for_inference(args, outputs):
         "enable_chwn4",
         "enable_fuse_conv_bias_nonlinearity",
         "enable_fuse_conv_bias_with_z",
+        "enable_fuse_preprocess",
     ]
     kwargs = {}
     for k in args_list:
@@ -465,6 +466,12 @@ def main():
         "nvidia GPU (this optimization pass will result in mismatch "
         "of the precision of output of training and inference)",
     )
+    parser.add_argument(
+        "--enable-fuse-preprocess",
+        action="store_true",
+        help="fuse preprocessing operators such as astype, pad channel and "
+        "dimshuffle that follow the h2d opr",
+    )
 
     args = parser.parse_args()
     feeds = make_feeds(args)
diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
index ee8a86397f844b75761b7e25df8e23e819b59e73..8f04f94bd35db381c4b61ed6bd0b8b8e6c0b7b6a 100644
--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -117,7 +117,7 @@ void TensorRTRuntimeOpr::get_output_var_shape(
             chan_pos = 1;
         }
         dims.nbDims = dims.nbDims + 1;
-        dims.d[chan_pos] = dims.d[chan_pos] / 4;
+        dims.d[chan_pos] = (dims.d[chan_pos] + 3) / 4;
         dims.d[dims.nbDims - 1] = 4;
     }
 #endif
diff --git a/src/tensorrt/test/tensorrt_runtime.cpp b/src/tensorrt/test/tensorrt_runtime.cpp
index bba31c0b499815e2e31bd0b94f6ae95569e4467c..0a659f2cebeae2dea9eb761f1ee2aac457e7e520 100644
--- a/src/tensorrt/test/tensorrt_runtime.cpp
+++ b/src/tensorrt/test/tensorrt_runtime.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "megbrain/comp_node_env.h"
@@ -14,12 +15,13 @@
 #include "megbrain/test/helper.h"
 #include "megbrain/test/megdnn_helper.h"
 #include "megbrain/utils/debug.h"
+#include "megbrain/opr/basic_arith.h"
 
 #if MGB_ENABLE_TENSOR_RT
 
+#include "make_trt_net.h"
 #include "megbrain/tensorrt/tensorrt_opr.h"
 #include "megbrain/tensorrt/tensorrt_runtime_opr.h"
-#include "make_trt_net.h"
 
 #include <random>
 
@@ -29,8 +31,6 @@ using namespace nvinfer1;
 template <typename T>
 using TensorRTUniquePtr = intl::TensorRTUniquePtr<T>;
 
-
-
 TEST(TestOprTensorRT, RuntimeBasic) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;
@@ -61,7 +61,6 @@ TEST(TestOprTensorRT, RuntimeBasic) {
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);
 }
 
-
 TEST(TestOprTensorRT, RuntimeBasicBatched) {
     REQUIRE_GPU(1);
     intl::BatchedTensorRTNetwork net;
@@ -80,7 +79,9 @@ TEST(TestOprTensorRT, RuntimeBasicBatched) {
                 builder->buildCudaEngine(*trt_net)};
 #endif
         TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
-        auto nx = opr::Broadcast::make(net.x, {1, net.x.shape()[0], net.x.shape()[1], net.x.shape()[2]});
+        auto nx = opr::Broadcast::make(
+                net.x,
+                {1, net.x.shape()[0], net.x.shape()[1], net.x.shape()[2]});
         return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {nx})[0];
     };
     auto y2 = make_trt();
@@ -93,7 +94,6 @@
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);
 }
 
-
 TEST(TestOprTensorRT, ConcatRuntimeBasic) {
     REQUIRE_GPU(1);
     intl::ConcatConvTensorRTNetwork net;
@@ -168,6 +168,97 @@ TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 5e-4);
 }
 
+#if NV_TENSOR_RT_VERSION >= 6001
+TEST(TestOprTensorRT, IOFormatFree) {
+    size_t N = 1, C = 3, H = 7, W = 7;
+    REQUIRE_GPU(1);
+    TensorRTUniquePtr<IBuilder> builder{
+            createInferBuilder(TensorRTOpr::Logger::instance()), {}};
+    nvinfer1::NetworkDefinitionCreationFlags flags;
+    ::memset(&flags, 0, sizeof(nvinfer1::NetworkDefinitionCreationFlags));
+    flags = 1 << static_cast<int>(
+                    nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    TensorRTUniquePtr<INetworkDefinition> network{
+            builder->createNetworkV2(flags), {}};
+    auto cast = [](size_t i) { return static_cast<int>(i); };
+    ITensor* data = network->addInput(
+            "data", DataType::kINT8, Dims4{cast(N), cast(C), cast(H), cast(W)});
+    TensorFormats formats = 1
+                            << static_cast<int>(nvinfer1::TensorFormat::kCHW4);
+    data->setAllowedFormats(formats);
+    data->setDynamicRange(-127.f * 1.2f, 127.f * 1.2f);
+    HostTensorGenerator<> fgen;
+    auto mean = fgen({N, C, H, W});
+    Weights mean_weights{DataType::kFLOAT, nullptr, 0};
+    mean_weights.values = mean->raw_ptr();
+    mean_weights.count = N * C * H * W;
+    auto constant = network->addConstant(
+            Dims4{cast(N), cast(C), cast(H), cast(W)}, mean_weights);
+    auto out = network->addElementWise(*network->getInput(0),
+                                       *constant->getOutput(0),
+                                       ElementWiseOperation::kSUB);
+    out->getOutput(0)->setDynamicRange(-127.f * 2.3f, 127.f * 2.3f);
+    network->markOutput(*out->getOutput(0));
+    network->getInput(0)->setType(DataType::kINT8);
+    network->getOutput(0)->setType(DataType::kFLOAT);
+    network->getOutput(0)->setAllowedFormats(
+            1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR));
+    TensorRTUniquePtr<IBuilderConfig> build_config{
+            builder->createBuilderConfig()};
+    build_config->setFlag(BuilderFlag::kINT8);
+    build_config->setFlag(BuilderFlag::kSTRICT_TYPES);
+    TensorRTUniquePtr<ICudaEngine> cuda_engine{
+            builder->buildEngineWithConfig(*network, *build_config)};
+    TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
+
+    HostTensorGenerator<dtype::Int8> gen;
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkvar = [&](const char* name, const TensorShape& shp,
+                     const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name),
+                dtype);
+    };
+    auto x = mkvar("x", {N, C, H, W}, dtype::QuantizedS8(1.2f));
+    auto fx = opr::TypeCvt::make(x, dtype::Float32());
+    auto wval = opr::SharedDeviceTensor::make(*graph, *mean).rename("mean");
+    auto z = fx - wval;
+    HostTensorND y1;
+    auto func1 = graph->compile({make_callback_copy(z, y1)});
+    func1->execute();
+
+    TensorShape shp{N, 1, H, W};
+    auto host = std::make_shared<HostTensorND>(x.node()->comp_node(),
+                                               x.node()->dtype());
+    host->resize(shp);
+    auto ptr = host->raw_ptr();
+    size_t size_bytes = TensorLayout{shp, x.node()->dtype()}.span().dist_byte();
+    std::memset(ptr, 0, size_bytes);
+    auto padding = opr::ImmutableTensor::make(*graph, *host);
+    x = opr::Concat::make({x, padding}, 1);
+
+    auto nchw2nchw4 = [](SymbolVar x) {
+        auto xshp = opr::GetVarShape::make(x);
+
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp = opr::Concat::make(
+                {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
+        return y1;
+    };
+    x = nchw2nchw4(x);
+    auto trt = TensorRTRuntimeOpr::make(mem->data(), mem->size(), {x})[0];
+    HostTensorND y2;
+    auto func2 = graph->compile({make_callback_copy(trt, y2)});
+    func2->execute();
+    MGB_ASSERT_TENSOR_EQ(y1, y2);
+}
+#endif
+
 #endif  // MGB_ENABLE_TENSOR_RT
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
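Usage sketch for the new option, not taken from the patch itself: dest_vars, the surrounding graph construction, and the name optimized_vars are illustrative assumptions; only the optimize_for_inference signature and the enable_fuse_preprocess kwarg come from the diff above.

    from megengine.core.tensor.megbrain_graph import optimize_for_inference

    # Pass the new kwarg alongside the existing enable_* options. Per the patch,
    # it sets GraphOptimizeOptions.fuse_preprocess so that astype / pad channel /
    # dimshuffle preprocessing following the h2d opr can be fused.
    # dest_vars: output vars of an already-built computing graph (assumed here);
    # the return value is assumed to be the optimized output vars.
    optimized_vars = optimize_for_inference(
        dest_vars,
        enable_fuse_preprocess=True,
    )

The same switch is exposed on the dump_with_testcase_mge.py command line as --enable-fuse-preprocess.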