diff --git a/src/tensorrt/impl/tensorrt_opr.cpp b/src/tensorrt/impl/tensorrt_opr.cpp
index 91f5cc3bf0d273ed6de8a5d83ed7670a4305ab25..f7647a0cc7fcaf2604bd00fee4f33e158bdc6bf0 100644
--- a/src/tensorrt/impl/tensorrt_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_opr.cpp
@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() {
     printf("Total time: %4.3fms\n", total_time);
 }
 
-std::shared_ptr<json::Value> TensorRTProfiler::to_json() {
-    using namespace json;
-    auto prof_arr = Array::make();
-    for (auto&& rec : profile) {
-        auto&& item = Array::make();
-        item->add(String::make(rec.first));
-        item->add(Number::make(rec.second));
-        prof_arr->add(item);
-    }
-    return prof_arr;
-}
 #endif  // MGB_ENABLE_JSON
 
 
@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
 void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                            CompNode comp_node_check,
                            nvinfer1::ICudaEngine* engine,
-                           size_t batch) {
+                           size_t batch, bool use_trt_profiler) {
 
     auto comp_node = opr->comp_node();
     // ICudaEngine is bound to the currently active device
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                    comp_node_check.to_string().c_str(),
                    comp_node.to_string().c_str());
     }
-#if MGB_ENABLE_JSON
-    auto pf_holder_pair =
-            opr->owner_graph()
-                    ->options()
-                    .user_data.get_user_data<opr_profile::OprProfileHolder>();
-    if (m_has_profiler && !pf_holder_pair.second) {
-        m_context.reset();
-        m_has_profiler = false;
-    }
-#endif
     auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
     bool should_reinit_device_memory =
             !m_context || m_device_workspace_memory_ptr != workspace_ptr;
     if (!m_context) {
         m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
-        m_has_profiler = false;
     }
     m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1);
     bool is_trt_opr = false;
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
 
     bool exec_success = false;
 
-#if MGB_ENABLE_JSON
-    if (!pf_holder_pair.second) {
-        mgb_assert(!m_has_profiler,
-                   "Invalid state of TensorRTRuntimeOpr: should not have "
-                   "profiler.");
+    if (!use_trt_profiler) {
 #if NV_TENSOR_RT_VERSION >= 6001
         if (is_trt_opr)
             exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
     } else {
         TensorRTProfiler trt_profiler;
         m_context->setProfiler(&trt_profiler);
-        m_has_profiler = true;
         // TensorRT documentation stated that IExecutionContext->execute
         // "Synchronously execute inference on a batch", and it does not take a
         // cudaStream_t, we expect it do a device synchronize. But it seems like
@@ -272,24 +245,12 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
         exec_success = m_context->execute(batch, m_trt_iobuf.data());
 #endif
+        // trt_profiler lives on this stack frame while m_context outlives the
+        // call; detach it before the assert can throw or the scope ends so the
+        // context never keeps a dangling IProfiler pointer.
+        m_context->setProfiler(nullptr);
         mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-        pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json();
         printf("TRT profile info of opr %s:\n", opr->name().c_str());
         trt_profiler.print_layer_times();
     }
-#else
-#if NV_TENSOR_RT_VERSION >= 6001
-    if (is_trt_opr)
-        exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
-                                            env.cuda_env().stream, nullptr);
-    else
-        exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                          env.cuda_env().stream, nullptr);
-#else
-    exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                      env.cuda_env().stream, nullptr);
-#endif
-    mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-#endif
 }
 
 /* ========================== TensorRTOpr ========================== */
diff --git a/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h b/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
index 04f0086a7ad71033b38c5e21589f6be2c098397e..7d520b89451ca0365ff95e4c9c18e4696b48da5d 100644
--- a/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
+++ b/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
@@ -50,11 +50,11 @@ class TensorRTManager {
     std::vector<void*> m_trt_iobuf;
     TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
     void* m_device_workspace_memory_ptr;
-    bool m_has_profiler;
 
 public:
     void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check,
-              nvinfer1::ICudaEngine* engine, size_t batch = 1);
+              nvinfer1::ICudaEngine* engine, size_t batch = 1,
+              bool use_trt_profiler = false);
 
     void clear_trt_context() { m_context.reset(); }
 
diff --git a/src/tensorrt/test/tensorrt.cpp b/src/tensorrt/test/tensorrt.cpp
index c4797601e525e64f00e8807dde754f17baa45123..31ca8dc8533ad38f2b75924b931360f45f1047d7 100644
--- a/src/tensorrt/test/tensorrt.cpp
+++ b/src/tensorrt/test/tensorrt.cpp
@@ -28,50 +28,6 @@ using namespace mgb;
 using namespace nvinfer1;
 using namespace opr;
 
-TEST(TestOprTensorRT, Profile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-
-    auto p = net.create_trt_network(true);
-
-    auto y2 = TensorRTOpr::make(TensorRTOpr::to_shared_ptr_builder(p.first),
-                                TensorRTOpr::to_shared_ptr_network(p.second),
-                                intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
-                                {net.x0, net.x1})[0];
-
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-
-        func->execute();
-
-        profiler.to_json()->writeto_fpath(
-                output_file("TestOprTensorRT.Profile.FromProfiler.json"));
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-
-        auto record_obj =
-                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
-
 TEST(TestOprTensorRT, Basic) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;
diff --git a/src/tensorrt/test/tensorrt_runtime.cpp b/src/tensorrt/test/tensorrt_runtime.cpp
index debd27bbda4581a66ddc3040b1eaf1efa2c44136..bc1132f36ecaad71bda0d55c5fbed99b7a2a6a11 100644
--- a/src/tensorrt/test/tensorrt_runtime.cpp
+++ b/src/tensorrt/test/tensorrt_runtime.cpp
@@ -10,7 +10,6 @@
  */
 
 #include "megbrain/comp_node_env.h"
-#include "megbrain/plugin/profiler.h"
 #include "megbrain/test/autocheck.h"
 #include "megbrain/test/helper.h"
 #include "megbrain/test/megdnn_helper.h"
@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) {
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
 }
 
-TEST(TestOprTensorRT, RuntimeProfile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-    SymbolVar y2;
-    {
-        auto p = net.create_trt_network(false);
-        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
-        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
-        builder->setMaxBatchSize(5);
-#if NV_TENSOR_RT_VERSION >= 6001
-        TensorRTUniquePtr<IBuilderConfig> build_config{
-                builder->createBuilderConfig()};
-        auto cuda_engine =
-                builder->buildEngineWithConfig(*trt_net, *build_config);
-#else
-        auto cuda_engine = builder->buildCudaEngine(*trt_net);
-#endif
-        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
-
-        FILE* fout = fopen(output_file("trt_cuda_engine").c_str(), "wb");
-        auto wr = fwrite(mem->data(), 1, mem->size(), fout);
-        mgb_assert(wr == mem->size());
-        fclose(fout);
-
-        y2 = TensorRTRuntimeOpr::make(
-                TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
-                {net.x0, net.x1})[0];
-    }
-
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-
-        func->execute();
-
-        profiler.to_json()->writeto_fpath(output_file(
-                "TestOprTensorRT.RuntimeProfile.FromProfiler.json"));
-
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-        auto record_obj =
-                *static_cast<json::Object*>(prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr = *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
-
 TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;