MegEngine · Commit 88574565
Authored Sep 24, 2020 by Megvii Engine Team
fix(mgb/core): use thread local fix multi thread use same compnode with recorder enabled
GitOrigin-RevId: 7d3daa866c114f77c312783ed7431cbaaddecdee
Parent: 3246ee5e

Showing 6 changed files with 378 additions and 248 deletions.

src/core/impl/comp_node/cpu/comp_node.cpp  +226 −202
src/core/impl/graph/cg_impl_seq.cpp         +15 −8
src/core/include/megbrain/comp_node.h       +42 −37
src/core/test/comp_node_helper.cpp          +31 −0
src/core/test/comp_node_helper.h             +1 −1
src/core/test/graph/multi_thread.cpp        +63 −0
src/core/impl/comp_node/cpu/comp_node.cpp (+226 −202)

This diff is collapsed in the capture, so the CPU comp node changes themselves are not shown here.
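The collapsed diff carries the heart of the fix: per the commit message, the CPU comp node's recorder slot moves into thread-local storage so that several threads can drive the same comp node with recording enabled without clobbering each other's recorder. Since the actual MegEngine code is not visible here, the following is only a minimal self-contained sketch of that pattern; `ToyCompNode` and `Recorder` are illustrative names, not MegEngine API.

```cpp
#include <cassert>
#include <thread>
#include <vector>

struct Recorder {
    std::vector<int> kernels;  // recorded "kernel" ids
};

struct ToyCompNode {
    // One recorder slot per *thread* instead of one per comp node, so two
    // threads recording on the same comp node cannot stomp on each other.
    static thread_local Recorder* cur_recorder;

    void dispatch(int kernel_id) {
        if (cur_recorder)
            cur_recorder->kernels.push_back(kernel_id);  // record, don't run
        // else: execute kernel_id immediately (omitted in this sketch)
    }
};
thread_local Recorder* ToyCompNode::cur_recorder = nullptr;

int main() {
    ToyCompNode cn;  // a single comp node shared by both threads
    auto worker = [&](int base) {
        Recorder rec;
        ToyCompNode::cur_recorder = &rec;  // start recording on this thread
        cn.dispatch(base);
        cn.dispatch(base + 1);
        ToyCompNode::cur_recorder = nullptr;  // stop recording
        assert(rec.kernels.size() == 2 && rec.kernels[0] == base);
    };
    std::thread t0{worker, 0}, t1{worker, 100};
    t0.join();
    t1.join();
}
```

With a plain (non-thread_local) member slot, the two workers above would race on the same pointer and record into each other's sequences; the thread-local slot gives each thread an independent recorder on the shared comp node.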
src/core/impl/graph/cg_impl_seq.cpp (+15 −8)

@@ -78,14 +78,16 @@ class ComputingGraphImpl::ComputingSequence::ExecContext {
     void warmup_for_fake_exec_with_recorder() {
         // Rerun recorder to ensure that all internal caches stabilize
-        m_recorder->enter_fake_exec();
+        auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
+        m_recorder->enter_fake_exec(comp_node);
         m_comp_seq->m_exec_env.start_exec();
         m_comp_seq->m_exec_env.wait_all();
-        m_recorder->exit_fake_exec();
+        m_recorder->exit_fake_exec(comp_node);
     }

     void stop_and_move_recorder() {
-        m_recorder->stop();
+        auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
+        m_recorder->stop(comp_node);
         if (m_fake_next_exec) {
             m_owner_graph->options().fake_next_exec = false;
         } else {

@@ -439,17 +441,22 @@ void ComputingGraphImpl::ComputingSequence::on_first_exec() {
         m_used_comp_node.insert(j->comp_node());
     }
-    // we maintain a recorder because events may depend on whether recorder
-    // is enabled
-    auto recorder = check_enable_comp_node_seq_recorder();
     auto&& options = m_owner_graph->options();
+    //! The recorder in comp_node is thread_local, so the creating thread must
+    //! be the same as the executing thread; hence set the synchronous mode
+    if (m_enable_comp_node_seq_recorder) {
+        m_exec_env.set_async_level(0);
+    } else {
+        m_exec_env.set_async_level(options.async_exec_level);
+    }
     if (options.async_exec_level) {
         for (auto i : m_used_comp_node)
             m_exec_env.add_comp_node(i);
     }
+    // we maintain a recorder because events may depend on whether recorder
+    // is enabled
+    auto recorder = check_enable_comp_node_seq_recorder();
     // create events for timing and sync
     for (auto&& i : m_used_comp_node) {
         size_t flag = 0;
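The second hunk forces `set_async_level(0)` (synchronous dispatch) whenever recording is enabled. The comment gives the reason: the recorder now lives in thread-local storage inside the comp node, so a recorder installed by the thread that creates the sequence is invisible from any worker thread that an asynchronous ExecEnv would dispatch kernels on. A tiny stand-alone demonstration of that visibility rule (plain C++, not MegEngine code):

```cpp
// thread_local state set on one thread is invisible to another thread; this
// is why recording requires create-thread == execute-thread (async level 0).
#include <iostream>
#include <thread>

thread_local const char* recorder_slot = nullptr;

int main() {
    recorder_slot = "recorder-A";  // installed on the "creating" thread

    // Asynchronous-style dispatch: the worker has its own, empty slot, so
    // kernels dispatched here would silently bypass recorder-A.
    std::thread worker([] {
        std::cout << "worker sees: "
                  << (recorder_slot ? recorder_slot : "nullptr") << '\n';
    });
    worker.join();

    // Synchronous execution stays on the creating thread and sees the slot.
    std::cout << "creator sees: " << recorder_slot << '\n';
}
```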
src/core/include/megbrain/comp_node.h (+42 −37)

@@ -32,39 +32,7 @@ namespace cg {
 class ComputingGraph;
 }

-/*!
- * \brief record computation operations on a computing node
- *
- * This is used for fast execution of an identical computation sequence where
- * only input/output data differ.
- *
- * When this object is created from a comp node, recording starts immediately.
- * Call stop() when computation finishes, and call replay() when it needs to be
- * re-executed.
- *
- * Implementations should hold a global lock on the comp node until stop() is
- * called.
- */
-class CompNodeSeqRecorder {
-public:
-    virtual ~CompNodeSeqRecorder() noexcept = default;
-
-    /*!
-     * \brief Enter fake-exec mode
-     *
-     * Memory allocation/free is only allowed in fake-exec mode, and kernels
-     * should not be actually recorded in this mode.
-     *
-     * This should be paired with exit_fake_exec()
-     */
-    virtual void enter_fake_exec() = 0;
-
-    //! Exit fake-exec mode
-    virtual void exit_fake_exec() = 0;
-
-    virtual void stop() = 0;
-
-    virtual void replay() = 0;
-};
+class CompNodeSeqRecorder;

 /*!
  * \brief identifier for a memory node

@@ -563,18 +531,55 @@ class CompNode {
     //! is needed
     ImplBase* m_impl = nullptr;

-    CompNode(ImplBase* impl) : m_impl{impl} {}
-
     friend class CompNodeEnv;
     friend struct HashTrait<CompNode>;
     friend class CompNodeImplHelper;
+
+public:
+    CompNode(ImplBase* impl) : m_impl{impl} {}
 };

 MGB_DEF_ENUM_CLASS_BIT_OPR(CompNode::Flag)

+/*!
+ * \brief record computation operations on a computing node
+ *
+ * This is used for fast execution of an identical computation sequence where
+ * only input/output data differ.
+ *
+ * When this object is created from a comp node, recording starts immediately.
+ * Call stop() when computation finishes, and call replay() when it needs to be
+ * re-executed.
+ *
+ * Implementations should be thread safe within the comp node, so that multiple
+ * threads can record on the same comp node simultaneously; this is achieved by
+ * keeping the recorder in thread-local storage in the comp node.
+ *
+ * Note: once recording is over, the recorder is independent of the comp node,
+ * so tasks dispatched into the recorder must not rely on comp node methods,
+ * and replay() runs on the user thread.
+ */
+class CompNodeSeqRecorder {
+public:
+    virtual ~CompNodeSeqRecorder() noexcept = default;
+
+    /*!
+     * \brief Enter fake-exec mode
+     *
+     * Memory allocation/free is only allowed in fake-exec mode, and kernels
+     * should not be actually recorded in this mode.
+     *
+     * This should be paired with exit_fake_exec()
+     */
+    virtual void enter_fake_exec(const CompNode& comp_node) = 0;
+
+    //! Exit fake-exec mode
+    virtual void exit_fake_exec(const CompNode& comp_node) = 0;
+
+    virtual void stop(const CompNode& comp_node) = 0;
+
+    virtual void replay() = 0;
+};
+
 /*!
  * \brief event associated with a CompNode node, used for cross-device
  * synchronization
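The relocated doc comment pins down the recorder lifecycle: recording starts at creation, stop() detaches the recorder from the comp node, and replay() re-runs the captured sequence on the user thread with only input/output data changed. Below is a toy, self-contained recorder that follows this contract; it records closures rather than comp node kernel dispatches, so it is only an illustration of the lifecycle, not the real MegEngine implementation.

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Toy recorder: capture tasks once, then replay them; buffers are bound by
// reference, so only their *contents* may differ between replays.
class ToySeqRecorder {
    std::vector<std::function<void()>> m_tasks;
    bool m_fake = false, m_stopped = false;

public:
    void enter_fake_exec() { m_fake = true; }  // warmup: allocate, don't record
    void exit_fake_exec() { m_fake = false; }
    void record(std::function<void()> task) {
        if (!m_fake && !m_stopped)
            m_tasks.push_back(std::move(task));
    }
    void stop() { m_stopped = true; }  // recorder now independent of its source
    void replay() {                    // runs on the caller's (user) thread
        for (auto& t : m_tasks)
            t();
    }
};

int main() {
    float x = 1.f, y = 0.f;
    ToySeqRecorder rec;
    rec.record([&] { y = x * 2; });  // the recorded "kernel"
    rec.stop();
    for (int i = 0; i < 3; ++i) {
        x = float(i);  // only input data changes between replays
        rec.replay();
        std::cout << y << '\n';  // prints 0, 2, 4
    }
}
```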
src/core/test/comp_node_helper.cpp (+31 −0)

@@ -471,6 +471,37 @@ void run<shape_dep_const_shape>(CompNode cn) {
     MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
 }

+//! single thread multi recorder run interleave
+template <>
+void run<multi_recorder_run>(CompNode cn) {
+    using ConvParam = opr::Convolution::Param;
+    ConvParam param;
+    param.sparse = ConvParam::Sparse::GROUP;
+    HostTensorGenerator<> gen;
+    std::vector<HostTensorND> host_z_v(2, HostTensorND());
+    std::vector<std::unique_ptr<mgb::cg::AsyncExecutable>> funcs;
+    auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn);
+    auto gen_graph =
+            [&](int graph_id) -> std::unique_ptr<mgb::cg::AsyncExecutable> {
+        auto graph = ComputingGraph::make();
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+             y = opr::Host2DeviceCopy::make(*graph, host_y),
+             z = opr::Convolution::make(x, y, param);
+        graph->options().comp_node_seq_record_level = 1;
+        return graph->compile({make_callback_copy(z, host_z_v[graph_id])});
+    };
+    funcs.push_back(gen_graph(0));
+    funcs.push_back(gen_graph(1));
+    for (int iter = 0; iter < 10; ++iter) {
+        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
+        funcs[0]->execute();
+        funcs[1]->execute();
+        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
+        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[0], 1e-3) << "iter " << iter;
+        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[1], 1e-3) << "iter " << iter;
+    }
+}
+
 template <>
 void run<void>(CompNode) {}
src/core/test/comp_node_helper.h (+1 −1)

@@ -56,7 +56,7 @@ namespace seq_rec {
     cb(dyn_elemwise_fake_exec) \
     cb(level2) cb(level2_multi_holder) cb(level2_share_storage) \
     cb(level2_exec_check) cb(sync_from_func) cb(cb_non_contig) \
-    cb(shape_dep_const_shape)
+    cb(shape_dep_const_shape) cb(multi_recorder_run)
 // clang-format on

 #define def_tags(name) \
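This one-token change is enough to register the new test because the header keeps its test tags in an X-macro list: every site that expands the list with its own `cb` automatically picks up `multi_recorder_run`. A generic sketch of the technique follows; the real `def_tags` body is truncated in this capture, so the expansions below are assumed, not copied from MegEngine.

```cpp
#include <cstdio>

// X-macro list: each expansion site supplies its own cb.
#define FOR_EACH_TAG(cb)      \
    cb(sync_from_func)        \
    cb(shape_dep_const_shape) \
    cb(multi_recorder_run)  // one edit here updates every expansion site

// Expand once to declare a tag type per entry...
#define def_tag(name) struct name {};
FOR_EACH_TAG(def_tag)
#undef def_tag

// ...and again to enumerate the registered tags at runtime.
#define print_tag(name) std::printf("%s\n", #name);
int main() { FOR_EACH_TAG(print_tag) }
#undef print_tag
```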
src/core/test/graph/multi_thread.cpp (+63 −0)

@@ -12,6 +12,7 @@
 #include "megbrain/opr/io.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/system.h"
+#include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/test/helper.h"

@@ -20,6 +21,37 @@
 using namespace mgb;

 namespace {
+template <typename Opr>
+HostTensorND eval_conv(const std::shared_ptr<HostTensorND>& src,
+                       const std::shared_ptr<HostTensorND>& filter,
+                       const typename Opr::Param& param = {}) {
+    auto graph = ComputingGraph::make();
+    graph->options().log_level = 0;
+    SymbolVar x = opr::Host2DeviceCopy::make(*graph, src);
+    SymbolVar y = opr::Host2DeviceCopy::make(*graph, filter);
+    SymbolVar z = Opr::make(x, y, param);
+
+    HostTensorND host_z;
+    auto func = graph->compile({make_callback_copy(z, host_z)});
+    func->execute();
+
+    host_z.sync();
+    return host_z;
+}
+
+template <typename Opr>
+HostTensorND eval_conv_cpu(const HostTensorND& xv, const HostTensorND& fv,
+                           const typename Opr::Param& param = {}) {
+    auto cn = CompNode::load("cpux");
+    auto src = std::make_shared<HostTensorND>(cn, xv.layout()),
+         filter = std::make_shared<HostTensorND>(cn, fv.layout());
+    memcpy(src->raw_ptr(), xv.raw_ptr(), xv.layout().span().dist_byte());
+    memcpy(filter->raw_ptr(), fv.raw_ptr(), fv.layout().span().dist_byte());
+    return eval_conv<Opr>(src, filter, param);
+}
 }  // namespace

 TEST(TestGraph, AsyncExecLevel) {
     REQUIRE_GPU(1);

@@ -165,4 +197,35 @@ TEST(TestGraph, ParallelRun) {
         i.join();
 }

+TEST(TestGraph, MultiThreadRecorder) {
+    using ConvParam = opr::Convolution::Param;
+    ConvParam param;
+    param.sparse = ConvParam::Sparse::GROUP;
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpux");
+    auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn);
+    auto worker = [&](int record_level) {
+        HostTensorND host_z;
+        auto graph = ComputingGraph::make();
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+             y = opr::Host2DeviceCopy::make(*graph, host_y),
+             z = opr::Convolution::make(x, y, param);
+        graph->options().comp_node_seq_record_level = record_level;
+        graph->options().var_sanity_check_first_run = false;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        for (int i = 0; i < 5; i++) {
+            func->execute();
+        }
+        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
+        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3);
+    };
+
+    std::vector<std::thread> workers;
+    for (size_t i = 0; i < 4; ++i)
+        workers.emplace_back(worker, i % 2);
+    for (auto&& i : workers)
+        i.join();
+}

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}