diff --git a/src/gopt/impl/subgraph_extractor.cpp b/src/gopt/impl/subgraph_extractor.cpp
index ffced9ea46ee3331a4e1753b5592343d682f4142..5e0d88f520a917907ec767e30f72e6368fc6696c 100644
--- a/src/gopt/impl/subgraph_extractor.cpp
+++ b/src/gopt/impl/subgraph_extractor.cpp
@@ -11,17 +11,214 @@
  */
 
 #include "megbrain/gopt/subgraph_extractor.h"
+#include <atomic>
+#include "megbrain/serialization/opr_shallow_copy.h"
 
 using namespace mgb;
 using namespace cg;
 using namespace gopt;
 
+/* ================== GraphPartition::InputPlaceholder =================*/
+// clang-format off
+MGB_DEFINE_OPR_CLASS(GraphPartition::InputPlaceholder,
+                     cg::SingleCNOperatorNodeBase) // {
+public:
+    InputPlaceholder(VarNode* src_var, const TensorShape& infer_shp,
+                     std::unique_ptr<HostTensorND> infer_val = nullptr);
+
+    static SymbolVar make(VarNode* src_var, const TensorShape& infer_shp,
+                          std::unique_ptr<HostTensorND> infer_val = nullptr);
+
+    size_t input_id() const { return m_id; }
+
+private:
+    void init_output_static_infer_desc() override;
+    void scn_do_execute() override;
+    void init_output_comp_node() override;
+
+    const size_t m_id;
+    TensorShape m_infer_shp;
+    std::unique_ptr<HostTensorND> m_infer_val;
+    static std::atomic_size_t sm_id;
+};
+// clang-format on
+
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(GraphPartition::InputPlaceholder);
+
+std::atomic_size_t GraphPartition::InputPlaceholder::sm_id{0};
+
+GraphPartition::InputPlaceholder::InputPlaceholder(
+        VarNode* src_var, const TensorShape& infer_shp,
+        std::unique_ptr<HostTensorND> infer_val)
+        : Super(src_var->owner_graph(), {}, {}, {}),
+          m_id{sm_id.fetch_add(1, std::memory_order_relaxed)},
+          m_infer_shp{infer_shp},
+          m_infer_val{std::move(infer_val)} {
+    name(ssprintf("InputPlaceholder@%zu", m_id));
+    add_equivalence_component<ScalarHash<DTypeEnum>>(src_var->dtype().enumv());
+    add_equivalence_component<ScalarHash<size_t>>(m_id);
+    add_output(None)->dtype(src_var->dtype());
+}
+
+void GraphPartition::InputPlaceholder::init_output_comp_node() {
+    output(0)->comp_node(CompNode::default_cpu());
+}
+
+void GraphPartition::InputPlaceholder::scn_do_execute() {
+    mgb_throw(InternalError, "InputPlaceholder opr can not be executed");
+}
+
+void GraphPartition::InputPlaceholder::init_output_static_infer_desc() {
+    using namespace cg::static_infer;
+    auto&& mgr = owner_graph()->static_infer_manager();
+    if (m_infer_shp.ndim == 0) {
+        auto infer_shape = [](TensorShape&, const InpVal&) { return false; };
+        mgr.register_shape_infer(output(0),
+                                 {SourceType::MUTABLE, {}, infer_shape});
+    } else {
+        mgr.register_shape_infer(output(0),
+                                 ShapeInferDesc::make_const(m_infer_shp));
+    }
+
+    if (m_infer_val == nullptr) {
+        auto infer_value = [](DeviceTensorND&, const InpVal&) { return false; };
+        mgr.register_value_infer(output(0),
+                                 {SourceType::MUTABLE, {}, infer_value});
+    } else {
+        auto infer_value = [this](DeviceTensorND& dest, const InpVal&) {
+            dest.copy_from(*m_infer_val).sync();
+            return true;
+        };
+        mgr.register_value_infer(output(0),
+                                 {SourceType::CONSTANT, {}, infer_value});
+    }
+}
+
+SymbolVar GraphPartition::InputPlaceholder::make(
+        VarNode* src_var, const TensorShape& infer_shp,
+        std::unique_ptr<HostTensorND> infer_val) {
+    return src_var->owner_graph()
+            ->insert_opr(std::make_unique<InputPlaceholder>(
+                    src_var, infer_shp, std::move(infer_val)))
+            ->output(0);
+}
+
+/* ================== GraphPartition =================*/
+#if MGB_ENABLE_JSON
+std::shared_ptr<json::Value> GraphPartition::to_json() const {
+    auto replaced_outputs = std::get<1>(replace_graph_by_placeholder());
+
+    ThinHashSet<VarNode*> all_var_node;
+    ThinHashSet<OperatorNodeBase*> all_opr_node;
+    auto comp_seq = json::Array::make();
+
+    auto cb = [&](OperatorNodeBase* opr) {
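+        // DepOprIter callback: record the visiting order of each opr in
+        // comp_seq, and collect all reachable oprs and vars for the dump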
+        comp_seq->add(json::String::make(opr->id_str()));
+        for (const auto& i : opr->input()) {
+            if (all_var_node.count(i) == 0) {
+                all_var_node.insert(i);
+            }
+        }
+        all_opr_node.insert(opr);
+        for (const auto& o : opr->output()) {
+            all_var_node.insert(o);
+        }
+    };
+    cg::DepOprIter iter{cb};
+    for (const auto& o : replaced_outputs)
+        iter.add(o->owner_opr());
+
+    auto dump_node_coll = [](auto&& collection) {
+        auto objptr = json::Object::make();
+        auto&& obj = *objptr;
+        for (auto&& i : collection)
+            obj[i->id_str()] = i->to_json();
+        return objptr;
+    };
+
+    return json::Object::make({{"operator", dump_node_coll(all_opr_node)},
+                               {"var", dump_node_coll(all_var_node)},
+                               {"comp_seq", comp_seq}});
+}
+#endif
+
+std::pair<VarNodeArray, VarNodeArray>
+GraphPartition::replace_graph_by_placeholder() const {
+    ThinHashMap<VarNode*, VarNode*> old2new;
+    auto graph_partition_copy_opr_shallow = [](OperatorNodeBase* opr,
+                                               const VarNodeArray& inps) {
+        OperatorNodeConfig config = opr->config();
+        return serialization::copy_opr_shallow(*opr, inps, config)->output(0);
+    };
+
+    OperatorNodeSet input_opr_set;
+    for (const auto& i : m_inputs)
+        input_opr_set.insert(i->owner_opr());
+
+    VarNodeArray placeholders;
+    VarNodeArray replaced_outputs;
+    VarNodeArray new_i;
+    auto cb = [&](OperatorNodeBase* opr) {
+        for (const auto& o : opr->output()) {
+            if (o->contain_flag(VarNode::Flag::VOLATILE_CONTENT) ||
+                (input_opr_set.count(opr) && !m_inputs.count(o))) {
+                continue;
+            }
+            VarNode* new_o;
+            if (m_inputs.count(o)) {
+                auto&& mgr = opr->owner_graph()->static_infer_manager();
+                const TensorShape* shp_ptr = nullptr;
+                if (cg::is_static_var_shape(o)) {
+                    shp_ptr = mgr.infer_shape_fallible(o);
+                }
+                TensorShape infer_shp;
+                if (shp_ptr)
+                    infer_shp = *shp_ptr;
+                std::unique_ptr<HostTensorND> hval = nullptr;
+                const DeviceTensorND* dval_ptr = nullptr;
+                if (cg::is_static_var_value(o)) {
+                    dval_ptr = mgr.infer_value_fallible(o);
+                }
+                if (dval_ptr) {
+                    hval.reset(new HostTensorND(CompNode::default_cpu(),
+                                                dval_ptr->dtype()));
+                    hval->resize(dval_ptr->shape()).copy_from(*dval_ptr).sync();
+                }
+                new_o = InputPlaceholder::make(o, infer_shp, std::move(hval))
+                                .node();
+                placeholders.push_back(new_o);
+            } else {
+                new_i.clear();
+                for (const auto& i : opr->input()) {
+                    new_i.push_back(old2new.at(i));
+                }
+                new_o = graph_partition_copy_opr_shallow(o->owner_opr(), new_i);
+            }
+            old2new[o] = new_o;
+        }
+    };
+    cg::DepOprIter iter{cb};
+    for (auto&& i : m_inputs) {
+        for (auto&& j : i->owner_opr()->input()) {
+            if (!input_opr_set.count(j->owner_opr()) &&
+                !m_opr_set.count(j->owner_opr())) {
+                iter.set_visited(j->owner_opr());
+            }
+        }
+    }
+    for (auto&& o : m_outputs)
+        iter.add(o->owner_opr());
+    for (auto&& o : m_outputs) {
+        replaced_outputs.push_back(old2new.at(o));
+    }
+    return std::make_pair(placeholders, replaced_outputs);
+}
+
 /* ================== SubGraphExtractor =================*/
-std::vector<InternalGraph> SubGraphExtractor::extract(
+std::vector<GraphPartition> SubGraphExtractor::extract(
         const SymbolVarArray& endpoint_vars) const {
     ThinHashMap<OperatorNodeBase*, std::pair<OperatorNodeBase*, size_t>> parent;
     thin_function<OperatorNodeBase*(OperatorNodeBase*)> union_find;
-    auto union_find = [&parent, &union_find](OperatorNodeBase* o) {
+    union_find = [&parent, &union_find](OperatorNodeBase* o) {
         if (parent[o].first == o)
             return o;
         else {
@@ -34,7 +231,7 @@ std::vector<InternalGraph> SubGraphExtractor::extract(
                                 OperatorNodeBase* y) {
         auto root_x = union_find(x), root_y = union_find(y);
         if (root_x != root_y) {
-            OperatorNodeBase *large, small;
+            OperatorNodeBase *large, *small;
             if (parent[root_x].second < parent[root_y].second) {
                 small = root_x, large = root_y;
             } else {
@@ -42,25 +239,23 @@ std::vector<InternalGraph> SubGraphExtractor::extract(
         }
         parent[small].first = large;
         if (parent[large].second == parent[small].second) {
-            parend[large].second += 1;
+            parent[large].second += 1;
         }
     };
 
     std::vector<OperatorNodeBase*> topo;
-    auto cb = [&topo](OperatorNodeBase* opr) {
+    auto cb = [this, &parent, &union_merge, &topo](OperatorNodeBase* opr) {
         topo.push_back(opr);
-        if (opr_list.count(opr->dyn_typeinfo()) == 0)
+        if (m_opr_list.count(opr->dyn_typeinfo()) == 0)
             return;
         auto find = parent.find(opr);
         if (find == parent.end()) {
-            auto insert =
-                    parent.insert(std::make_pair(opr, std::make_pair(opr, 0)));
-            find = insert.first;
+            parent.insert(std::make_pair(opr, std::make_pair(opr, 0)));
        }
        for (auto&& i : opr->input()) {
            auto&& o = i->owner_opr();
-            if (opr_list.count(o->dyn_typeinfo()) == 0)
+            if (m_opr_list.count(o->dyn_typeinfo()) == 0)
                 continue;
             union_merge(opr, o);
         }
@@ -69,33 +264,51 @@ std::vector<InternalGraph> SubGraphExtractor::extract(
     };
     cg::DepOprIter iter{cb};
     for (const auto& v : endpoint_vars)
         iter.add(v.node()->owner_opr());
 
-    std::vector<InternalGraph> partitions;
-    ThinHashMap<OperatorNodeBase*, InternalGraph*> roots;
+    std::vector<GraphPartition> partitions;
+    partitions.reserve(topo.size());
+    ThinHashMap<OperatorNodeBase*, GraphPartition*> roots;
     for (const auto& opr : reverse_adaptor(topo)) {
-        auto root = union_find(opr);
-        auto find = roots.find(root);
-        InternalGraph* internal_graph = nullptr;
-        if (find == roots.end()) {
-            partitions.emplace_back(InternalGraph{});
-            auto insert =
-                    roots.insert(std::make_pair(root, &partitions.back()));
-            internal_graph = insert.first->second;
-            internal_graph->m_outputs.insert(opr->output(0));
+        if (m_opr_list.count(opr->dyn_typeinfo()) == 0) {
+            for (const auto& i : opr->input()) {
+                if (m_opr_list.count(i->owner_opr()->dyn_typeinfo())) {
+                    auto root = union_find(i->owner_opr());
+                    GraphPartition* partition;
+                    auto find = roots.find(root);
+                    if (find != roots.end()) {
+                        partition = find->second;
+                        partition->output().insert(i);
+                    }
+                }
+            }
         } else {
-            internal_graph = find->second;
-            auto erase = internal_graph->m_inputs.erase(opr->output(0));
-            if (erase > 0) {
-                internal_graph->m_internals.insert(opr->output(0));
+            auto root = union_find(opr);
+            auto find = roots.find(root);
+            GraphPartition* partition = nullptr;
+            if (find == roots.end()) {
+                partitions.emplace_back(GraphPartition{});
+                auto insert =
+                        roots.insert(std::make_pair(root, &partitions.back()));
+                partition = insert.first->second;
+                for (auto&& o : opr->output()) {
+                    if (!o->contain_flag(cg::VarNode::Flag::VOLATILE_CONTENT))
+                        partition->output().insert(o);
+                }
             } else {
-                internal_graph->m_outputs.insert(opr->output(0));
+                partition = find->second;
+                for (auto&& o : opr->output()) {
+                    if (!o->contain_flag(cg::VarNode::Flag::VOLATILE_CONTENT)) {
+                        auto erase = partition->input().erase(o);
+                        if (erase == 0)
+                            partition->output().insert(o);
+                    }
+                }
             }
+            partition->opr_set().insert(opr);
+            for (const auto& i : opr->input())
+                partition->input().insert(i);
         }
-        for (const auto& i : opr->input())
-            internal_graph->m_inputs.insert(i);
     }
     return partitions;
 }
 
-/* ============= SubGraphExtractor =================*/
-
 // vim: syntax=cpp.doxygen
diff --git a/src/gopt/include/megbrain/gopt/subgraph_extractor.h b/src/gopt/include/megbrain/gopt/subgraph_extractor.h
index e443c253df008cf2993d0d60ac50aecc099f6f56..f6e09a48dc838fc726557bb48a9051cf07092103 100644
--- a/src/gopt/include/megbrain/gopt/subgraph_extractor.h
+++ b/src/gopt/include/megbrain/gopt/subgraph_extractor.h
@@ -16,17 +16,37 @@
 namespace mgb {
 namespace gopt {
 
-struct InternalGraph {
-    ThinHashSet<VarNode*> m_internals;
-    ThinHashSet<VarNode*> m_inputs;
-    ThinHashSet<VarNode*> m_outputs;
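+//! a connected sub-graph whose operators are all contained in the OprList of
+//! the SubGraphExtractor that produced it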
+class GraphPartition {
+public:
+    using VarNodeSet = ThinHashSet<VarNode*>;
+    using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
+
+    class InputPlaceholder;
+
+    GraphPartition() = default;
+
+#if MGB_ENABLE_JSON
+    std::shared_ptr<json::Value> to_json() const;
+#endif
+
+    const OperatorNodeSet& opr_set() const { return m_opr_set; }
+    const VarNodeSet& input() const { return m_inputs; }
+    const VarNodeSet& output() const { return m_outputs; }
+    OperatorNodeSet& opr_set() { return m_opr_set; }
+    VarNodeSet& input() { return m_inputs; }
+    VarNodeSet& output() { return m_outputs; }
+
+private:
+    OperatorNodeSet m_opr_set;
+    VarNodeSet m_inputs;
+    VarNodeSet m_outputs;
+
+    std::pair<VarNodeArray, VarNodeArray> replace_graph_by_placeholder() const;
 };
 
 class SubGraphExtractor {
 public:
     using OprList = ThinHashSet<Typeinfo*>;
     SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {};
-    std::vector<InternalGraph> extract(
+    std::vector<GraphPartition> extract(
             const SymbolVarArray& endpoint_vars) const;
 
 private:
diff --git a/src/gopt/test/subgraph_extractor.cpp b/src/gopt/test/subgraph_extractor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d23a1ca23ea83083dddbd6bed4fbb96672ebe83
--- /dev/null
+++ b/src/gopt/test/subgraph_extractor.cpp
@@ -0,0 +1,275 @@
+/**
+ * \file src/gopt/test/subgraph_extractor.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "./helper.h"
+
+#include "megbrain/gopt/subgraph_extractor.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/blas.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/imgproc.h"
+#include "megbrain/opr/internal/identical_fwd.h"
+#include "megbrain/opr/nn_int.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/serialization/serializer.h"
+
+using namespace mgb;
+using namespace gopt;
+using namespace serialization;
+
+namespace {
+// clang-format off
+MGB_DEFINE_OPR_CLASS(MultipleInputOutput,
+                     cg::SingleCNOperatorNodeBase) // {
+public:
+    MultipleInputOutput(const VarNodeArray& inputs, const OperatorNodeConfig& config);
+
+    static SymbolVarArray make(const SymbolVarArray& inputs, const OperatorNodeConfig& config = {});
+
+private:
+    void scn_do_execute() override {}
+    void init_output_static_infer_desc() override {}
+};
+// clang-format on
+
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(MultipleInputOutput);
+
+MultipleInputOutput::MultipleInputOutput(const VarNodeArray& inputs,
+                                         const OperatorNodeConfig& config)
+        : Super(inputs[0]->owner_graph(), config, "multiple_input_output",
+                inputs) {
+    for (auto&& i : inputs)
+        add_input({i});
+    if (inputs.size() == 1) {
+        add_output(None);
+    } else {
+        for (size_t i = 0; i < inputs.size(); ++i)
+            add_output(ssprintf("o%zu", i));
+    }
+    cg::add_workspace_output(this);
+}
+
+SymbolVarArray MultipleInputOutput::make(const SymbolVarArray& inputs,
+                                         const OperatorNodeConfig& config) {
+    auto src = cg::to_var_node_array(inputs);
+    auto multiple_io = std::make_unique<MultipleInputOutput>(src, config);
+    auto ret =
+            cg::to_symbol_var_array(src[0]->owner_graph()
+                                            ->insert_opr(std::move(multiple_io))
+                                            ->output());
+    ret.pop_back();
+    return ret;
+}
+}  // namespace
+
+TEST(TestSubGraphExtractor, MultipleOutputs) {
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+
+    auto mkvar = [&](const char* name, const TensorShape& shp) {
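+        // a runtime input: host data copied to device on each execution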
+        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
+    };
+
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
+    };
+
+    graph->options().graph_opt_level = 0;
+    auto x = mkvar("x", {8, 8, 8, 8}), w1 = mkcvar("w1", {4, 8, 3, 3});
+    auto y = mkvar("y", {1, 8, 1, 1});
+    auto add = x + y;
+
+    opr::Convolution::Param param;
+    param.pad_h = param.pad_w = 1;
+    auto c1 = opr::Convolution::make(add, w1, param);
+    auto w2 = mkcvar("w2", {8, 4, 3, 3});
+    auto c2 = opr::ConvolutionBackwardData::make(w2, add, param, {}, {});
+    auto sym_var_arr = MultipleInputOutput::make({c1, c2});
+    auto z = sym_var_arr[1];
+    z = z + (-128);
+
+    using OprList = SubGraphExtractor::OprList;
+    static const OprList opr_list = {
+            opr::ConvolutionForward::typeinfo(),
+            opr::Elemwise::typeinfo(),
+            opr::TypeCvt::typeinfo(),
+            MultipleInputOutput::typeinfo(),
+    };
+    SubGraphExtractor extractor(opr_list);
+    auto partitions = extractor.extract({z});
+    ASSERT_EQ(partitions.size(), 1u);
+    // outputs: sym_var_arr[0], z, add
+    ASSERT_EQ(partitions[0].output().size(), 3u);
+    ASSERT_TRUE(partitions[0].output().count(add.node()) > 0);
+    ASSERT_TRUE(partitions[0].output().count(z.node()) > 0);
+    ASSERT_TRUE(partitions[0].output().count(sym_var_arr[0].node()) > 0);
+    ASSERT_TRUE(partitions[0].output().count(sym_var_arr[1].node()) == 0);
+    // inputs: x, y, w1, c2, (-128)
+    ASSERT_EQ(partitions[0].input().size(), 5u);
+    ASSERT_TRUE(partitions[0].input().count(x.node()) > 0);
+    ASSERT_TRUE(partitions[0].input().count(c2.node()) > 0);
+    // oprs: (x + y), conv1, multi_io, (z + (-128))
+    ASSERT_EQ(partitions[0].opr_set().size(), 4u);
+    ASSERT_TRUE(partitions[0].opr_set().count(add.node()->owner_opr()) > 0);
+    ASSERT_TRUE(partitions[0].opr_set().count(c1.node()->owner_opr()) > 0);
+    ASSERT_TRUE(partitions[0].opr_set().count(
+                        sym_var_arr[0].node()->owner_opr()) > 0);
+    ASSERT_TRUE(partitions[0].opr_set().count(z.node()->owner_opr()) > 0);
+}
+
+TEST(TestSubGraphExtractor, MultipleReaders) {
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+
+    auto mkvar = [&](const char* name, const TensorShape& shp) {
+        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
+    };
+
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name);
+    };
+
+    graph->options().graph_opt_level = 0;
+    auto x = mkvar("x", {8, 8, 8, 8}), w1 = mkcvar("w1", {4, 8, 3, 3});
+    auto y = mkvar("y", {1, 8, 1, 1});
+    auto add = x + y;
+
+    opr::Convolution::Param param;
+    param.pad_h = param.pad_w = 1;
+    auto c1 = opr::Convolution::make(add, w1, param);
+    auto w2 = mkcvar("w2", {8, 4, 3, 3});
+    auto c2 = opr::ConvolutionBackwardData::make(w2, add, param, {}, {});
+    auto z = c1 + c2;
+
+    using OprList = SubGraphExtractor::OprList;
+    static const OprList opr_list = {
+            opr::ConvolutionForward::typeinfo(),
+            opr::Elemwise::typeinfo(),
+            opr::TypeCvt::typeinfo(),
+    };
+    SubGraphExtractor extractor(opr_list);
+    auto partitions = extractor.extract({z});
+    ASSERT_EQ(partitions.size(), 1u);
+    ASSERT_EQ(partitions[0].output().size(), 2u);
+    ASSERT_TRUE(partitions[0].output().count(add.node()) > 0);
+    ASSERT_TRUE(partitions[0].output().count(z.node()) > 0);
+    ASSERT_EQ(partitions[0].input().size(), 4u);
+    ASSERT_TRUE(partitions[0].input().count(x.node()) > 0);
+    partitions[0].to_json()->writeto_fpath(
+            output_file("TestSubGraphExtractor.MultipleReaders.json"));
+}
+
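+// the network below mixes quantized and float oprs; both endpoints passed to
+// extract() lie in one connected component, so one partition is expected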
+TEST(TestSubGraphExtractor, Complicated) {
+    const size_t N = 16, C = 3, H = 768, W = 1280;
+    HostTensorGenerator<dtype::Uint8> gen;
+    auto graph = ComputingGraph::make();
+    /*   h2d
+          |
+          v
+      astype(f32)
+          |
+          v
+      add(-128)
+          |
+          v
+      astype(q8)
+          |
+          v
+        conv1
+          |
+          v
+      astype(u4)
+        /    \
+    conv2    conv3 -> astype(f32) -> output
+        \    /
+         qadd
+          |
+          v
+      astype(q8)
+        /    \
+   deconv    conv4
+        \    /
+        concat -> output */
+    auto h2d = opr::Host2DeviceCopy::make(*graph, gen({N, C, H, W}));
+    auto data = opr::TypeCvt::make(h2d, dtype::Float32());
+    auto sub_128 = data + (-128);
+    auto x = opr::TypeCvt::make(sub_128, dtype::QuantizedS8(1.f));
+    auto mkcvar = [&](const char* name, const TensorShape& shp,
+                      const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::SharedDeviceTensor::make(*graph, *gen(shp)).rename(name),
+                dtype);
+    };
+    auto w1 = mkcvar("w1", {16, 3, 3, 3}, dtype::QuantizedS8(1.f));
+    auto b1 = mkcvar("b1", {1, 16, 1, 1}, dtype::QuantizedS32(1.f));
+    opr::ConvBias::Param param;
+    param.stride_h = param.stride_w = 2;
+    param.pad_h = param.pad_w = 1;
+    auto conv1 = opr::ConvBias::make(
+            x, w1, b1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(1.f)));
+    conv1 = opr::TypeCvt::make(
+            conv1, dtype::Quantized4Asymm(1.f, static_cast<uint8_t>(8)));
+
+    auto w2 = mkcvar("w2", {16, 16, 3, 3}, dtype::QuantizedS4(1.f));
+    auto b2 = mkcvar("b2", {1, 16, 1, 1}, dtype::QuantizedS32(1.f));
+    auto conv2 = opr::ConvBias::make(conv1, w2, b2, param, {},
+                                     OperatorNodeConfig(dtype::Quantized4Asymm(
+                                             1.f, static_cast<uint8_t>(8))));
+
+    param.pad_h = param.pad_w = 0;
+    auto w3 = mkcvar("w3", {16, 16, 1, 1}, dtype::QuantizedS4(1.f));
+    auto b3 = mkcvar("b3", {1, 16, 1, 1}, dtype::QuantizedS32(1.f));
+    auto conv3 = opr::ConvBias::make(conv1, w3, b3, param, {},
+                                     OperatorNodeConfig(dtype::Quantized4Asymm(
+                                             1.f, static_cast<uint8_t>(8))));
+    auto conv3f = opr::TypeCvt::make(conv3, dtype::Float32());
+
+    auto qadd = opr::ElemwiseMultiType::make(
+            {conv2, conv3}, {opr::ElemwiseMultiType::Mode::QADD},
+            OperatorNodeConfig(
+                    dtype::Quantized4Asymm(1.f, static_cast<uint8_t>(8))));
+    auto q8 = opr::TypeCvt::make(qadd, dtype::QuantizedS8(1.f));
+
+    auto w4 = mkcvar("w4", {16, 16, 3, 3}, dtype::QuantizedS8(1.f));
+    param.stride_h = param.stride_w = 1;
+    param.pad_h = param.pad_w = 1;
+    auto conv4 = opr::ConvBiasForward::make(
+            q8, w4, param, {}, OperatorNodeConfig(dtype::QuantizedS8(1.f)));
+    conv4 = opr::TypeCvt::make(conv4, dtype::Float32());
+
+    opr::Convolution::Param conv_param;
+    conv_param.stride_h = conv_param.stride_w = 1;
+    conv_param.pad_h = conv_param.pad_w = 0;
+    auto w5 = mkcvar("w5", {16, 16, 1, 1}, dtype::QuantizedS8(1.f));
+    auto deconv = opr::ConvolutionBackwardData::make(
+            w5, q8, conv_param, {},
+            OperatorNodeConfig(dtype::QuantizedS8(1.f)));
+    deconv = opr::TypeCvt::make(deconv, dtype::Float32());
+    auto z = opr::Concat::make({conv4, deconv}, 1);
+
+    using OprList = SubGraphExtractor::OprList;
+    static const OprList opr_list = {
+            opr::ConvBiasForward::typeinfo(),
+            opr::ConvolutionForward::typeinfo(),
+            opr::ConvolutionBackwardData::typeinfo(),
+            opr::ElemwiseMultiType::typeinfo(),
+            opr::Elemwise::typeinfo(),
+            opr::TypeCvt::typeinfo(),
+            opr::PoolingForward::typeinfo(),
+            opr::WarpPerspectiveForward::typeinfo(),
+    };
+    SubGraphExtractor extractor(opr_list);
+    auto partitions = extractor.extract({conv3f.node(), z.node()});
+    ASSERT_EQ(partitions.size(), 1u);
+    const char* prefix = "TestSubGraphExtractor.Complicated";
+    partitions[0].to_json()->writeto_fpath(
+            output_file(ssprintf("%s.json", prefix).c_str()));
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}