提交 88574565 编写于 作者: M Megvii Engine Team

fix(mgb/core): use a thread-local recorder to fix multiple threads using the same comp node with the recorder enabled

GitOrigin-RevId: 7d3daa866c114f77c312783ed7431cbaaddecdee
上级 3246ee5e
......@@ -78,14 +78,16 @@ class ComputingGraphImpl::ComputingSequence::ExecContext {
//! Execute the computing sequence once, bracketed by the recorder's
//! enter_fake_exec / exit_fake_exec calls.
//! NOTE(review): this span comes from a diff view, so the argument-less
//! enter_fake_exec()/exit_fake_exec() calls appear to be the pre-change lines
//! shown next to their comp_node-taking replacements -- confirm which are live.
void warmup_for_fake_exec_with_recorder() {
// Rerun recorder to ensure that all internal caches stabilize
m_recorder->enter_fake_exec();
// The first element of m_used_comp_node is the comp node handed to the
// recorder calls below
auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
m_recorder->enter_fake_exec(comp_node);
// Start the execution environment and block until everything has finished
m_comp_seq->m_exec_env.start_exec();
m_comp_seq->m_exec_env.wait_all();
m_recorder->exit_fake_exec();
m_recorder->exit_fake_exec(comp_node);
}
void stop_and_move_recorder() {
m_recorder->stop();
auto comp_node = *(m_comp_seq->m_used_comp_node.begin());
m_recorder->stop(comp_node);
if (m_fake_next_exec) {
m_owner_graph->options().fake_next_exec = false;
} else {
......@@ -439,17 +441,22 @@ void ComputingGraphImpl::ComputingSequence::on_first_exec() {
m_used_comp_node.insert(j->comp_node());
}
// we maintain a recorder because events may depend on whether recorder
// is enabled
auto recorder = check_enable_comp_node_seq_recorder();
auto&& options = m_owner_graph->options();
//! The recorder in comp_node is thread_local, so the thread that creates it
//! must be the same as the thread that executes; hence use synchronous mode
if (m_enable_comp_node_seq_recorder) {
m_exec_env.set_async_level(0);
} else {
m_exec_env.set_async_level(options.async_exec_level);
}
if (options.async_exec_level) {
for (auto i : m_used_comp_node)
m_exec_env.add_comp_node(i);
}
// we maintain a recorder because events may depend on whether recorder
// is enabled
auto recorder = check_enable_comp_node_seq_recorder();
// create events for timing and sync
for (auto&& i : m_used_comp_node) {
size_t flag = 0;
......
......@@ -32,39 +32,7 @@ namespace cg {
class ComputingGraph;
}
/*!
* \brief record computation operations on a computing node
*
* This is used for fast execution of an identical computation sequence where
* only input/output data differ.
*
* When this object is created from a comp node, recording starts immediately.
* Call stop() when computation finishes, and call replay() when it needs to be
* re-executed.
*
* Implementations should hold a global lock on the comp node until stop() is
* called.
*/
class CompNodeSeqRecorder {
public:
virtual ~CompNodeSeqRecorder() noexcept = default;
/*!
* \brief Enter fake-exec mode
*
* Memory allocation/free is only allowed in fake-exec mode, and kernels
* should not be actually recorded in this mode.
*
* This should be paired with exit_fake_exec()
*/
virtual void enter_fake_exec() = 0;
//! Exit fake-exec mode
virtual void exit_fake_exec() = 0;
//! Stop recording; to be called when the computation finishes
virtual void stop() = 0;
//! Re-execute the recorded computation sequence
virtual void replay() = 0;
};
class CompNodeSeqRecorder;
/*!
* \brief identifier for a memory node
......@@ -563,18 +531,55 @@ class CompNode {
//! is needed
ImplBase *m_impl = nullptr;
CompNode(ImplBase *impl):
m_impl{impl}
{}
friend class CompNodeEnv;
friend struct HashTrait<CompNode>;
friend class CompNodeImplHelper;
public:
CompNode(ImplBase* impl) : m_impl{impl} {}
};
MGB_DEF_ENUM_CLASS_BIT_OPR(CompNode::Flag)
/*!
* \brief record computation operations on a computing node
*
* This is used for fast execution of an identical computation sequence where
* only input/output data differ.
*
* When this object is created from a comp node, recording starts immediately.
* Call stop() when computation finishes, and call replay() when it needs to be
* re-executed.
*
* Implementations should be thread safe with respect to the comp_node: in
* order to support multiple threads recording on the same comp_node
* simultaneously, a thread-local recorder is used in the comp_node.
*
* Note: when recording is over, the recorder is independent of the comp_node,
* so tasks dispatched into the recorder should not depend on comp_node
* methods, and the recorder is replayed on the user thread.
*/
class CompNodeSeqRecorder {
public:
virtual ~CompNodeSeqRecorder() noexcept = default;
/*!
* \brief Enter fake-exec mode
*
* Memory allocation/free is only allowed in fake-exec mode, and kernels
* should not be actually recorded in this mode.
*
* This should be paired with exit_fake_exec()
*
* \param comp_node comp node passed in by the caller -- NOTE(review):
*      presumably used to locate the thread-local recorder state; confirm
*      against the implementations
*/
virtual void enter_fake_exec(const CompNode& comp_node) = 0;
//! Exit fake-exec mode; pairs with enter_fake_exec()
virtual void exit_fake_exec(const CompNode& comp_node) = 0;
//! Stop recording; to be called when the computation finishes
virtual void stop(const CompNode& comp_node) = 0;
//! Re-execute the recorded computation sequence
virtual void replay() = 0;
};
/*!
* \brief event associated with a CompNode node, used for cross-device
* synchronization
......
......@@ -471,6 +471,37 @@ void run<shape_dep_const_shape>(CompNode cn) {
MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
}
//! One thread drives two recorded graphs, executing them in an interleaved
//! fashion, and checks both against a CPU reference on every iteration
template <>
void run<multi_recorder_run>(CompNode cn) {
    using ConvParam = opr::Convolution::Param;
    ConvParam conv_param;
    conv_param.sparse = ConvParam::Sparse::GROUP;

    HostTensorGenerator<> gen;
    // one output slot per graph, indexed by graph id
    std::vector<HostTensorND> outputs(2, HostTensorND());
    std::vector<std::unique_ptr<mgb::cg::AsyncExecutable>> compiled;
    auto host_x = gen({3, 4, 10, 8}, cn);
    auto host_y = gen({2, 3, 2, 3, 3}, cn);

    // Build a fresh graph (conv of host_x and host_y) with recording level 1
    // and compile it so that its result is copied into outputs[slot]
    auto build_func =
            [&](int slot) -> std::unique_ptr<mgb::cg::AsyncExecutable> {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Host2DeviceCopy::make(*graph, host_y);
        auto z = opr::Convolution::make(x, y, conv_param);
        graph->options().comp_node_seq_record_level = 1;
        return graph->compile({make_callback_copy(z, outputs[slot])});
    };
    for (int slot = 0; slot < 2; ++slot) {
        compiled.push_back(build_func(slot));
    }

    for (int iter = 0; iter < 10; ++iter) {
        // refresh the input in place so every iteration sees new data
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        for (auto&& func : compiled) {
            func->execute();
        }
        auto expect =
                eval_conv_cpu<opr::Convolution>(*host_x, *host_y, conv_param);
        for (auto&& got : outputs) {
            MGB_ASSERT_TENSOR_NEAR(expect, got, 1e-3) << "iter " << iter;
        }
    }
}
//! Specialization for the void tag; the body is empty
template <>
void run<void>(CompNode) {}
......
......@@ -56,7 +56,7 @@ namespace seq_rec {
cb(dyn_elemwise_fake_exec) \
cb(level2) cb(level2_multi_holder) cb(level2_share_storage) \
cb(level2_exec_check) cb(sync_from_func) cb(cb_non_contig) \
cb(shape_dep_const_shape)
cb(shape_dep_const_shape) cb(multi_recorder_run)
// clang-format on
#define def_tags(name) \
......
......@@ -12,6 +12,7 @@
#include "megbrain/opr/io.h"
#include "megbrain/opr/utility.h"
#include "megbrain/system.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/test/helper.h"
......@@ -20,6 +21,37 @@
using namespace mgb;
namespace{
/*!
 * \brief evaluate a two-input operator on a freshly created graph
 *
 * Both host tensors are fed through Host2DeviceCopy, combined by Opr::make
 * with the given param, and the result is copied back to a host tensor.
 *
 * \param src first operand
 * \param filter second operand
 * \param param operator param forwarded to Opr::make
 * \return host tensor holding the operator output
 */
template <typename Opr>
HostTensorND eval_conv(const std::shared_ptr<HostTensorND>& src,
                       const std::shared_ptr<HostTensorND>& filter,
                       const typename Opr::Param& param = {}) {
    auto graph = ComputingGraph::make();
    graph->options().log_level = 0;

    SymbolVar input = opr::Host2DeviceCopy::make(*graph, src);
    SymbolVar weight = opr::Host2DeviceCopy::make(*graph, filter);
    SymbolVar output = Opr::make(input, weight, param);

    HostTensorND result;
    auto func = graph->compile({make_callback_copy(output, result)});
    func->execute();
    result.sync();
    return result;
}
/*!
 * \brief evaluate a two-input operator on the "cpux" comp node
 *
 * The inputs are duplicated byte-for-byte into new host tensors bound to the
 * comp node loaded from "cpux" before being handed to eval_conv().
 *
 * \param xv first operand
 * \param fv second operand
 * \param param operator param forwarded to eval_conv()
 */
template <typename Opr>
HostTensorND eval_conv_cpu(const HostTensorND& xv, const HostTensorND& fv,
                           const typename Opr::Param& param = {}) {
    auto cn = CompNode::load("cpux");
    // allocate a tensor with the same layout on cn and copy the raw bytes
    auto clone_on_cn = [&cn](const HostTensorND& from) {
        auto to = std::make_shared<HostTensorND>(cn, from.layout());
        memcpy(to->raw_ptr(), from.raw_ptr(),
               from.layout().span().dist_byte());
        return to;
    };
    auto src = clone_on_cn(xv);
    auto filter = clone_on_cn(fv);
    return eval_conv<Opr>(src, filter, param);
}
} // namespace
TEST(TestGraph, AsyncExecLevel) {
REQUIRE_GPU(1);
......@@ -165,4 +197,35 @@ TEST(TestGraph, ParallelRun) {
i.join();
}
//! Four threads build and run the same convolution graph on one comp node at
//! the same time, alternating between recorder level 0 and level 1
TEST(TestGraph, MultiThreadRecorder) {
    using ConvParam = opr::Convolution::Param;
    ConvParam conv_param;
    conv_param.sparse = ConvParam::Sparse::GROUP;
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("cpux");
    auto host_x = gen({3, 4, 10, 8}, cn);
    auto host_y = gen({2, 3, 2, 3, 3}, cn);

    // Per-thread body: compile a private graph with the requested record
    // level, execute it five times and compare with the CPU reference
    auto run_graph = [&](int record_level) {
        HostTensorND host_z;
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Host2DeviceCopy::make(*graph, host_y);
        auto z = opr::Convolution::make(x, y, conv_param);
        auto& opts = graph->options();
        opts.comp_node_seq_record_level = record_level;
        opts.var_sanity_check_first_run = false;
        auto func = graph->compile({make_callback_copy(z, host_z)});
        for (int remaining = 5; remaining > 0; --remaining) {
            func->execute();
        }
        auto expect =
                eval_conv_cpu<opr::Convolution>(*host_x, *host_y, conv_param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3);
    };

    std::vector<std::thread> threads;
    for (size_t idx = 0; idx < 4; ++idx) {
        threads.emplace_back(run_graph, idx % 2);
    }
    for (auto& thread : threads) {
        thread.join();
    }
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册