diff --git a/CMakeLists.txt b/CMakeLists.txt
index 537b296047f18ace948d5b62bf26c17976dd79ac..45005245a16576f398dbe557fa33df0af48fbaaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -659,9 +659,9 @@ if(MGE_WITH_CUDA)
         if(MGE_WITH_TRT)
             if(MSVC OR WIN32)
                 message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}")
-                list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY})
+                list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${TRT_PLUGIN_LIBRARY})
             else()                
-                list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive)
+                list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libnvinfer_plugin -Wl,--no-whole-archive)
             endif()
             if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7)
                 message(STATUS "handle trt myelin lib after trt7")
@@ -738,7 +738,7 @@ if(MGE_WITH_CUDA)
         endif()
     else()
         if(MGE_WITH_TRT)
-            list(APPEND MGE_CUDA_LIBS libnvinfer)
+            list(APPEND MGE_CUDA_LIBS libnvinfer libnvinfer_plugin)
             if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7)
                 message(STATUS "handle trt myelin lib after trt7")
                 list(APPEND MGE_CUDA_LIBS libmyelin)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 52e31c96f4c8a7324682b2ad8a8e2756d0e489b6..53f0f43331bc76f46e2e5a55cfeb7bf3fc553fb3 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -9,6 +9,12 @@ if(MGE_CUDA_USE_STATIC)
         HINTS ${ALTER_LIBRARY_PATHS}
         PATH_SUFFIXES lib lib64
         DOC "TRT library." )
+    find_library(TRT_PLUGIN_LIBRARY  # plugin library for the MGE_CUDA_USE_STATIC branch
+        NAMES libnvinfer_plugin_static.a nvinfer_plugin.lib
+        HINTS ${ALTER_LIBRARY_PATHS}
+        PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX}
+        PATH_SUFFIXES lib lib64
+        DOC "TRT plugin library.")
 else()
     find_library(TRT_LIBRARY 
         NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll
@@ -16,11 +22,20 @@ else()
         HINTS ${ALTER_LIBRARY_PATHS}
         PATH_SUFFIXES lib lib64
         DOC "TRT library." )
+    find_library(TRT_PLUGIN_LIBRARY  # plugin library for the non-static (else) branch
+        NAMES libnvinfer_plugin.so libnvinfer_plugin.dylib nvinfer_plugin.dll
+        HINTS ${ALTER_LIBRARY_PATHS}
+        PATHS ${ALTER_LD_LIBRARY_PATHS} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX}
+        PATH_SUFFIXES lib lib64
+        DOC "TRT plugin library.")
 endif()
 
 if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND")
     message(FATAL_ERROR "Can not find TensorRT Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env")
 endif()
+if(NOT TRT_PLUGIN_LIBRARY)  # false-y covers empty values and any *-NOTFOUND find result
+    message(FATAL_ERROR "Can not find TensorRT Plugin Library, please refer to scripts/cmake-build/BUILD_README.md to init TRT env")
+endif()
 
 get_filename_component(__found_trt_root ${TRT_LIBRARY}/../.. REALPATH)
 find_path(TRT_INCLUDE_DIR 
@@ -28,10 +43,18 @@ find_path(TRT_INCLUDE_DIR
     HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root}
     PATH_SUFFIXES include 
     DOC "Path to TRT include directory." )
+find_path(TRT_PLUGIN_INCLUDE_DIR  # directory that contains NvInferPlugin.h
+    NAMES NvInferPlugin.h
+    PATH_SUFFIXES include
+    HINTS ${TRT_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_trt_root}
+    DOC "Path to TRT plugin include directory.")
 
 if(TRT_INCLUDE_DIR STREQUAL "TRT_INCLUDE_DIR-NOTFOUND")
     message(FATAL_ERROR "Can not find TensorRT INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env")
 endif()
+if(NOT TRT_PLUGIN_INCLUDE_DIR)  # false-y covers empty values and any *-NOTFOUND find result
+    message(FATAL_ERROR "Can not find TensorRT Plugin INCLUDE, please refer to scripts/cmake-build/BUILD_README.md to init TRT env")
+endif()
 
 file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
 file(STRINGS "${TRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
@@ -50,14 +73,20 @@ set(TRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${Te
 
 if(MGE_CUDA_USE_STATIC)
     add_library(libnvinfer STATIC IMPORTED)
+    add_library(libnvinfer_plugin STATIC IMPORTED)  # plugin target, same linkage kind as libnvinfer
 else()
     add_library(libnvinfer SHARED IMPORTED)
+    add_library(libnvinfer_plugin SHARED IMPORTED)  # plugin target, same linkage kind as libnvinfer
 endif()
 
 set_target_properties(libnvinfer PROPERTIES
     IMPORTED_LOCATION ${TRT_LIBRARY}
     INTERFACE_INCLUDE_DIRECTORIES ${TRT_INCLUDE_DIR}
 )
+set_target_properties(libnvinfer_plugin PROPERTIES  # quoted: each path stays one argument
+    IMPORTED_LOCATION "${TRT_PLUGIN_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${TRT_PLUGIN_INCLUDE_DIR}"
+)
 
 message(STATUS "Found TensorRT: ${__found_trt_root} (found version: ${TRT_VERSION_STRING})")
 
diff --git a/scripts/whl/windows/windows_build_whl.sh b/scripts/whl/windows/windows_build_whl.sh
index 0dc619085ae1833502a6bc119bc6e9f834a05a1e..b3824fbc796debc0ec5730c3a592c33b041defd7 100755
--- a/scripts/whl/windows/windows_build_whl.sh
+++ b/scripts/whl/windows/windows_build_whl.sh
@@ -70,6 +70,7 @@ fi
 
 # config NVIDIA libs
 TRT_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-6.0.1.5/lib/nvinfer.dll"
+TRT_PLUGIN_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-6.0.1.5/lib/nvinfer_plugin.dll" # same lib dir as TRT_LIB
 CUDNN_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/cudnn-10.1-windows10-x64-v7.6.5.32/cuda/bin/cudnn64_7.dll"
 CUSOLVER_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cusolver64_10.dll"
 CUBLAS_LIB="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/cublas64_10.dll"
@@ -86,6 +87,7 @@ function depend_real_copy() {
     if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
         echo "copy nvidia lib...."
         cp "${TRT_LIB}" ${REAL_DST}
+        cp "${TRT_PLUGIN_LIB}" ${REAL_DST}
         cp "${CUDNN_LIB}" ${REAL_DST}
         cp "${CUSOLVER_LIB}" ${REAL_DST}
         cp "${CUBLAS_LIB}" ${REAL_DST}
diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
index fe1f777a3856597e9c9c4a2834b9e7489da2364c..78d38c10da3e2228a8fe892460ec26b513e472d7 100644
--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -19,6 +19,7 @@
 #include <cinttypes>
 
 #if MGB_ENABLE_TENSOR_RT
+#include <NvInferPlugin.h>
 
 using namespace mgb;
 using namespace opr;
@@ -208,6 +209,7 @@ SymbolVarArray TensorRTRuntimeOpr::make(
             !CompNode::get_device_count(CompNode::DeviceType::CUDA), SystemError,
             "can not create TensorRTRuntimeOpr when CUDA is not available");
     mgb_assert(!src.empty(), "no inputs provided");
+    initLibNvInferPlugins(&TensorRTOpr::Logger::instance(), "");
     TensorRTUniquePtr<nvinfer1::IRuntime> runtime{
             nvinfer1::createInferRuntime(TensorRTOpr::Logger::instance()), {}};
     auto gpu_allocator = std::make_shared<GpuAllocator>(src[0].node()->comp_node());
diff --git a/src/tensorrt/test/make_trt_net.cpp b/src/tensorrt/test/make_trt_net.cpp
index d61a9ad3162c6777e11c6d1e01d0c1a0817f18e8..8e1c2341e2562a2f4f7201453ac1d721779468d0 100644
--- a/src/tensorrt/test/make_trt_net.cpp
+++ b/src/tensorrt/test/make_trt_net.cpp
@@ -25,6 +25,7 @@
 #include "make_trt_net.h"
 #include "megbrain/tensorrt/tensorrt_opr.h"
 
+#include <NvInferPlugin.h>
 #include <random>
 
 using namespace mgb;
@@ -404,6 +405,84 @@ std::pair<nvinfer1::IBuilder*, INetworkDefinition*> intl::ConcatConvTensorRTNetw
     return std::make_pair(builder, network);
 }
 
+intl::ReshapeConcatTensorRTNetwork::ReshapeConcatTensorRTNetwork() {
+    // Reference graph: (2,2,2,2)->(2,8,1,1) and (2,3,2,2)->(2,12,1,1), concat on axis 1.
+    host_x0 = gen({2, 2, 2, 2});
+    host_y0 = gen({2, 3, 2, 2});
+    graph = ComputingGraph::make();
+    x0 = Host2DeviceCopy::make(*graph, host_x0);
+    y0 = Host2DeviceCopy::make(*graph, host_y0);
+    auto flat_x = opr::Reshape::make(x0, {2, 8, 1, 1});
+    auto flat_y = opr::Reshape::make(y0, {2, 12, 1, 1});
+    z = opr::Concat::make({flat_x, flat_y}, 1);
+}
+
+std::pair<nvinfer1::IBuilder*, INetworkDefinition*> intl::ReshapeConcatTensorRTNetwork::
+        create_trt_network(bool has_batch_dim) {
+    // Builds a network whose single layer is the FlattenConcat_TRT plugin fed by x0/y0.
+    // Returns raw builder/network pointers; the caller is responsible for releasing them.
+    initLibNvInferPlugins(&TensorRTOpr::Logger::instance(), "");
+
+    CompNode::load("xpu0").activate();
+    auto builder = createInferBuilder(TensorRTOpr::Logger::instance());
+#if NV_TENSOR_RT_VERSION >= 6001
+    nvinfer1::NetworkDefinitionCreationFlags flags;
+    ::memset(&flags, 0, sizeof(nvinfer1::NetworkDefinitionCreationFlags));
+    if (has_batch_dim)
+        flags = 1 << static_cast<int>(
+                        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    auto network = builder->createNetworkV2(flags);
+#else
+    auto network = builder->createNetwork();
+#endif
+    nvinfer1::ITensor *data0, *data1;
+#if NV_TENSOR_RT_VERSION >= 6001
+    if (has_batch_dim) {
+        data0 = network->addInput("x0", DataType::kFLOAT, Dims4{2, 2, 2, 2});
+        data1 = network->addInput("y0", DataType::kFLOAT, Dims4{2, 3, 2, 2});
+    } else {
+        data0 = network->addInput("x0", DataType::kFLOAT, Dims3{2, 2, 2});
+        data1 = network->addInput("y0", DataType::kFLOAT, Dims3{3, 2, 2});
+    }
+    const nvinfer1::TensorFormats linear_only =
+            1 << static_cast<int>(nvinfer1::TensorFormat::kLINEAR);
+    data0->setAllowedFormats(linear_only);
+    data1->setAllowedFormats(linear_only);
+#else
+    if (has_batch_dim) {
+        data0 = network->addInput("x0", DataType::kFLOAT, DimsNCHW{2, 2, 2, 2});
+        data1 = network->addInput("y0", DataType::kFLOAT, DimsNCHW{2, 3, 2, 2});
+    } else {
+        data0 = network->addInput("x0", DataType::kFLOAT, DimsCHW{2, 2, 2});
+        data1 = network->addInput("y0", DataType::kFLOAT, DimsCHW{3, 2, 2});
+    }
+#endif
+    // Both fields are declared kINT32, so back each with a real int, not a narrower bool.
+    int axis = 1;
+    int ignoreBatch = 0;
+    nvinfer1::PluginField fields[2] = {
+            nvinfer1::PluginField{"axis", &axis, nvinfer1::PluginFieldType::kINT32, 1},
+            nvinfer1::PluginField{
+                    "ignoreBatch", &ignoreBatch, nvinfer1::PluginFieldType::kINT32, 1},
+    };
+    nvinfer1::PluginFieldCollection fc{2, fields};
+
+    // getPluginCreator() yields nullptr for an unregistered plugin; fail loudly instead.
+    auto creator = getPluginRegistry()->getPluginCreator("FlattenConcat_TRT", "1", "");
+    mgb_assert(creator != nullptr, "FlattenConcat_TRT creator is not registered");
+    TensorRTUniquePtr<nvinfer1::IPluginV2> plugin(
+            creator->createPlugin("FlattenConcat_TRT", &fc));
+    mgb_assert(plugin != nullptr, "failed to create FlattenConcat_TRT plugin");
+    ITensor* inputTensors[] = {data0, data1};
+    auto flt_cct = network->addPluginV2(inputTensors, 2, *plugin);
+    mgb_assert(flt_cct != nullptr, "FlattenConcat_TRT is invalid");
+    network->markOutput(*flt_cct->getOutput(0));
+#if NV_TENSOR_RT_VERSION >= 6001
+    flt_cct->getOutput(0)->setAllowedFormats(linear_only);
+#endif
+    return std::make_pair(builder, network);
+}
+
 #pragma GCC diagnostic pop
 #endif  // MGB_ENABLE_TENSOR_RT
 
diff --git a/src/tensorrt/test/make_trt_net.h b/src/tensorrt/test/make_trt_net.h
index d7a1e5c1d38b1a1336e4668373f92a5192fc25c5..e18bd3ea83e5cef58cc3e2b5023b373f4e7cb532 100644
--- a/src/tensorrt/test/make_trt_net.h
+++ b/src/tensorrt/test/make_trt_net.h
@@ -92,6 +92,18 @@ struct ConcatConvTensorRTNetwork {
             bool has_batch_dim);
 };
 
+// Test fixture: a Reshape+Concat reference graph plus a matching TensorRT network builder.
+struct ReshapeConcatTensorRTNetwork {
+    HostTensorGenerator<> gen;
+    std::shared_ptr<HostTensorND> host_x0;
+    std::shared_ptr<HostTensorND> host_y0;
+    std::shared_ptr<ComputingGraph> graph;
+    SymbolVar x0, y0, z;  // two graph inputs and the reference output
+    ReshapeConcatTensorRTNetwork();
+    std::pair<nvinfer1::IBuilder*, INetworkDefinition*> create_trt_network(
+            bool has_batch_dim);
+};
+
 }  // namespace intl
 }  // namespace opr
 }  // namespace mgb
diff --git a/src/tensorrt/test/tensorrt_runtime.cpp b/src/tensorrt/test/tensorrt_runtime.cpp
index a219ec0c4c12f7aa82c67dd4d3a3a7712f2f9d58..69714042fb3dfda09c6d9ece9d13e8253ce91353 100644
--- a/src/tensorrt/test/tensorrt_runtime.cpp
+++ b/src/tensorrt/test/tensorrt_runtime.cpp
@@ -23,6 +23,7 @@
 #include "megbrain/tensorrt/tensorrt_opr.h"
 #include "megbrain/tensorrt/tensorrt_runtime_opr.h"
 
+#include <fstream>
 #include <random>
 
 using namespace mgb;
@@ -244,6 +245,68 @@ TEST(TestOprTensorRT, IOFormatFree) {
 }
 #endif
 
+TEST(TestOprTensorRT, FlattenConcatPlugin) {
+    // The serialized plugin engine, run via TensorRTRuntimeOpr, must reproduce net.z.
+    REQUIRE_GPU(1);
+    intl::ReshapeConcatTensorRTNetwork net;
+    auto make_trt = [&net]() {
+        auto p = net.create_trt_network(false);
+        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
+        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
+        builder->setMaxBatchSize(5);
+#if NV_TENSOR_RT_VERSION >= 6001
+        TensorRTUniquePtr<IBuilderConfig> build_config{builder->createBuilderConfig()};
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{
+                builder->buildEngineWithConfig(*trt_net, *build_config)};
+#else
+        TensorRTUniquePtr<ICudaEngine> cuda_engine{builder->buildCudaEngine(*trt_net)};
+#endif
+        mgb_assert(cuda_engine != nullptr, "failed to build FlattenConcat_TRT engine");
+        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
+        return TensorRTRuntimeOpr::make(mem->data(), mem->size(), {net.x0, net.y0})[0];
+    };
+    auto z2 = make_trt();
+    HostTensorND host_z1, host_z2;
+    auto func = net.graph->compile(
+            {make_callback_copy(net.z, host_z1), make_callback_copy(z2, host_z2)});
+    func->execute();
+    MGB_ASSERT_TENSOR_EQ(host_z1, host_z2);
+}
+
+TEST(TestOprTensorRT, ICudaEngine) {
+    // Optional smoke test: returns early unless "model.trt" can be read from the cwd.
+    REQUIRE_GPU(1);
+    CompNode::load("xpu0").activate();
+    std::ifstream engineFile("model.trt", std::ios::binary);
+    if (!engineFile)
+        return;
+
+    engineFile.seekg(0, engineFile.end);
+    const std::streamoff fsize = engineFile.tellg();
+    engineFile.seekg(0, engineFile.beg);
+    // tellg() reports -1 on failure; never size the buffer from a non-positive length.
+    if (!engineFile || fsize <= 0)
+        return;
+
+    std::vector<char> engineData(static_cast<size_t>(fsize));
+    engineFile.read(engineData.data(), fsize);
+    if (!engineFile)
+        return;
+
+    // NOTE(review): shapes presumably match the inputs model.trt was built for -- confirm.
+    std::shared_ptr<ComputingGraph> graph = ComputingGraph::make();
+    HostTensorGenerator<> gen;
+    std::shared_ptr<HostTensorND> host_x0 = gen({2, 3, 375, 500});
+    std::shared_ptr<HostTensorND> host_y0 = gen({2, 1, 1, 3});
+    SymbolVar x0 = Host2DeviceCopy::make(*graph, host_x0);
+    SymbolVar y0 = Host2DeviceCopy::make(*graph, host_y0);
+    auto z = TensorRTRuntimeOpr::make(
+            engineData.data(), engineData.size(), {x0, y0})[0];
+    HostTensorND host_z;
+    auto func = graph->compile({make_callback_copy(z, host_z)});
+    func->execute();
+}
+
 #endif  // MGB_ENABLE_TENSOR_RT
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}