提交 734c498d 编写于 作者: M Megvii Engine Team

perf(mgb/core): improve DevMemAlloc when it has single stream

GitOrigin-RevId: 61874faa6d3be40ff9984efceb44a0cd0b4f2435
上级 39bd66fc
......@@ -267,45 +267,59 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
}
size_t DevMemAllocImpl::gather_stream_free_blk_and_release_full() {
size_t gathered_size = 0;
MGB_LOCK_GUARD(m_mutex);
for (auto &&pair: m_stream_alloc) {
auto ch = pair.second.get();
auto &&chmtx = ch->m_mutex;
MGB_LOCK_GUARD(chmtx);
for (auto &&i: ch->m_free_blk_size) {
merge_free_unsafe(i.first);
gathered_size += i.first.size;
}
ch->m_free_blk_addr.clear();
ch->m_free_blk_size.clear();
}
mgb_assert(gathered_size <= m_used_size.load());
m_used_size -= gathered_size;
size_t free_size = 0;
using Iter = decltype(m_free_blk_size.begin());
std::vector<void*> to_free_by_raw;
for (Iter i = m_free_blk_size.begin(), inext; i != m_free_blk_size.end();
i = inext) {
inext = i;
++ inext;
auto &&blk = i->first;
if (blk.addr.is_head) {
auto riter = m_alloc_from_raw.find(blk.addr.addr_ptr());
mgb_assert(riter != m_alloc_from_raw.end() &&
blk.size <= riter->second);
if (blk.size == riter->second) {
to_free_by_raw.push_back(blk.addr.addr_ptr());
free_size += blk.size;
auto j = i->second.aiter;
m_free_blk_size.erase(i);
m_free_blk_addr.erase(j);
m_alloc_from_raw.erase(riter);
MGB_LOCK_GUARD(m_mutex);
auto return_full_free_blk_unsafe = [&](MemAllocImplHelper* alloc) {
auto&& free_blk_size = alloc->m_free_blk_size;
auto&& free_blk_addr = alloc->m_free_blk_addr;
using Iter = decltype(m_free_blk_size.begin());
for (Iter i = free_blk_size.begin(), inext; i != free_blk_size.end();
i = inext) {
inext = i;
++ inext;
auto &&blk = i->first;
if (blk.addr.is_head) {
auto riter = m_alloc_from_raw.find(blk.addr.addr_ptr());
mgb_assert(riter != m_alloc_from_raw.end() &&
blk.size <= riter->second);
if (blk.size == riter->second) {
to_free_by_raw.push_back(blk.addr.addr_ptr());
free_size += blk.size;
auto j = i->second.aiter;
free_blk_size.erase(i);
free_blk_addr.erase(j);
m_alloc_from_raw.erase(riter);
}
}
}
};
if (auto child = get_single_child_stream_unsafe()) {
MGB_LOCK_GUARD(child->m_mutex);
return_full_free_blk_unsafe(child);
mgb_assert(free_size <= m_used_size.load());
m_used_size -= free_size;
} else {
size_t gathered_size = 0;
for (auto &&pair: m_stream_alloc) {
auto ch = pair.second.get();
auto &&chmtx = ch->m_mutex;
MGB_LOCK_GUARD(chmtx);
for (auto &&i: ch->m_free_blk_size) {
merge_free_unsafe(i.first);
gathered_size += i.first.size;
}
ch->m_free_blk_addr.clear();
ch->m_free_blk_size.clear();
}
mgb_assert(gathered_size <= m_used_size.load());
m_used_size -= gathered_size;
}
return_full_free_blk_unsafe(this);
m_tot_allocated_from_raw -= free_size;
// we have to sync to ensure no kernel on the child stream still uses
......@@ -359,6 +373,25 @@ FreeMemStat DevMemAllocImpl::get_free_memory_dev() {
return ret;
}
void DevMemAllocImpl::insert_free_unsafe(const FreeBlock &block) {
if (auto child = get_single_child_stream_unsafe()) {
{
MGB_LOCK_GUARD(child->m_mutex);
child->insert_free_unsafe(block);
}
m_used_size += block.size;
} else {
MemAllocImplHelper::insert_free_unsafe(block);
}
}
/*!
 * \brief get the only registered stream allocator
 * \return the sole entry of m_stream_alloc when it holds exactly one
 *      element; nullptr for any other element count
 */
StreamMemAllocImpl* DevMemAllocImpl::get_single_child_stream_unsafe() {
    if (m_stream_alloc.size() != 1) {
        return nullptr;
    }
    return m_stream_alloc.begin()->second.get();
}
DevMemAllocImpl::~DevMemAllocImpl() {
for (auto &&i: m_alloc_from_raw)
m_raw_allocator->free(i.first);
......
......@@ -94,7 +94,7 @@ class MemAllocImplHelper: virtual public MemAllocBase {
* \brief directly insert a free block into m_free_blk_size and
* m_free_blk_addr, without merging
*/
inline void insert_free_unsafe(const FreeBlock &block);
virtual void insert_free_unsafe(const FreeBlock &block);
/*!
* \brief allocate from parent allocator; this method must either return
......@@ -153,6 +153,12 @@ class StreamMemAllocImpl final: public StreamMemAlloc,
{}
};
/*!
 * \note DevMemAlloc has a two-level structure, but when only one stream is
 * registered with it, DevMemAlloc behaves like a single-level allocator
 * (i.e. only the FreeBlock pool of its child stream allocator is used)
 * for better performance.
 */
class DevMemAllocImpl final: public DevMemAlloc,
public MemAllocImplHelper {
friend class StreamMemAllocImpl;
......@@ -193,6 +199,14 @@ class DevMemAllocImpl final: public DevMemAlloc,
size_t get_used_memory() override { return m_used_size.load(); }
void insert_free_unsafe(const FreeBlock &block) override;
/*!
* \brief return stream allocator if DevMemAlloc has single child,
* otherwise return nullptr
*/
StreamMemAllocImpl* get_single_child_stream_unsafe();
public:
DevMemAllocImpl(
int device, size_t reserve_size,
......
......@@ -209,18 +209,73 @@ TEST(TestMemAlloc, Alloc) {
auto ptr = strm_alloc->alloc_shared(REQ);
EXPECT_EQ(REQ, strm_alloc->get_used_memory());
EXPECT_EQ(0u, strm_alloc->get_free_memory().tot);
EXPECT_EQ(REQ, dev_alloc->get_used_memory());
EXPECT_EQ(TOT - REQ, dev_alloc->get_free_memory().tot);
EXPECT_EQ(TOT - REQ, strm_alloc->get_free_memory().tot);
EXPECT_EQ(TOT, dev_alloc->get_used_memory());
EXPECT_EQ(0u, dev_alloc->get_free_memory().tot);
auto addr = ptr.get();
ptr.reset();
EXPECT_EQ(0u, strm_alloc->get_used_memory());
EXPECT_EQ(REQ, strm_alloc->get_free_memory().tot);
EXPECT_EQ(REQ, dev_alloc->get_used_memory());
EXPECT_EQ(TOT - REQ, dev_alloc->get_free_memory().tot);
EXPECT_EQ(TOT, strm_alloc->get_free_memory().tot);
EXPECT_EQ(TOT, dev_alloc->get_used_memory());
EXPECT_EQ(0u, dev_alloc->get_free_memory().tot);
EXPECT_EQ(addr, strm_alloc->alloc_shared(REQ).get());
}
TEST(TestMemAlloc, MergeFreeBlock) {
    // Single-stream scenario: a released block is expected to be handed out
    // again at the same address for a larger request, and the remaining
    // capacity must still be allocatable afterwards.
    constexpr size_t TOT = 7000, FIRST = 2000, SECOND = 3000, REST = 4000;
    using StreamKey = DevMemAlloc::StreamKey;

    auto raw_alloc = std::make_shared<DummyAllocator>(TOT);
    auto runtime_policy = std::make_shared<DummyRuntimePolicy>(0);
    auto dev_alloc = DevMemAlloc::make(0, TOT, raw_alloc, runtime_policy);

    // only the address of this variable is used as the stream key
    StreamKey key_storage = nullptr;
    auto stream = dev_alloc->add_stream(static_cast<StreamKey>(&key_storage));

    // allocate, remember the address, then release
    auto blk = stream->alloc_shared(FIRST);
    auto first_addr = blk.get();
    blk.reset();

    // a larger request must start at the address of the released block
    blk = stream->alloc_shared(SECOND);
    EXPECT_EQ(first_addr, blk.get());

    // requests exactly TOT - SECOND bytes while the SECOND block is alive;
    // presumably this only succeeds if the free space was merged -- confirm
    stream->alloc_shared(REST);
}
// Checks the used/free accounting of a device allocator with two registered
// streams: stream 0 allocates and releases REQ0 bytes, then stream 1
// allocates and releases REQ1 bytes, with expectations after every step.
TEST(TestMemAlloc, AllocTwoStream) {
constexpr size_t TOT = 2048, REQ0 = 1000, REQ1 = 2000;
using StreamKey = DevMemAlloc::StreamKey;
auto raw_alloc = std::make_shared<DummyAllocator>(TOT);
auto runtime_policy = std::make_shared<DummyRuntimePolicy>(0);
auto dev_alloc = DevMemAlloc::make(0, TOT, raw_alloc, runtime_policy);
// the keys are left uninitialized; only their addresses are passed on
StreamKey stream_key0, stream_key1;
auto strm_alloc0 =
dev_alloc->add_stream(static_cast<StreamKey>(&stream_key0)),
strm_alloc1 =
dev_alloc->add_stream(static_cast<StreamKey>(&stream_key1));
// distinct keys must yield distinct stream allocators
ASSERT_NE(strm_alloc0, strm_alloc1);
// step 1: stream 0 allocates REQ0; the device allocator reports exactly
// REQ0 as used and TOT - REQ0 as free
auto ptr0 = strm_alloc0->alloc_shared(REQ0);
EXPECT_EQ(REQ0, strm_alloc0->get_used_memory());
EXPECT_EQ(0u, strm_alloc0->get_free_memory().tot);
EXPECT_EQ(REQ0, dev_alloc->get_used_memory());
EXPECT_EQ(TOT - REQ0, dev_alloc->get_free_memory().tot);
// step 2: releasing ptr0 moves REQ0 into stream 0's free memory while the
// device-level numbers stay unchanged
ptr0.reset();
EXPECT_EQ(0u, strm_alloc0->get_used_memory());
EXPECT_EQ(REQ0, strm_alloc0->get_free_memory().tot);
EXPECT_EQ(REQ0, dev_alloc->get_used_memory());
EXPECT_EQ(TOT - REQ0, dev_alloc->get_free_memory().tot);
// step 3: stream 1 allocates REQ1 (> TOT - REQ0); afterwards stream 0 is
// expected to hold no free memory and the device allocator none either
auto ptr1 = strm_alloc1->alloc_shared(REQ1);
EXPECT_EQ(0u, strm_alloc0->get_free_memory().tot);
EXPECT_EQ(REQ1, strm_alloc1->get_used_memory());
EXPECT_EQ(0u, strm_alloc1->get_free_memory().tot);
EXPECT_EQ(REQ1, dev_alloc->get_used_memory());
EXPECT_EQ(0u, dev_alloc->get_free_memory().tot);
// step 4: releasing ptr1 moves REQ1 into stream 1's free memory; the
// device allocator still reports REQ1 used and nothing free
ptr1.reset();
EXPECT_EQ(0u, strm_alloc1->get_used_memory());
EXPECT_EQ(REQ1, strm_alloc1->get_free_memory().tot);
EXPECT_EQ(REQ1, dev_alloc->get_used_memory());
EXPECT_EQ(0u, dev_alloc->get_free_memory().tot);
}
TEST(TestMemAlloc, AllocMoreThanReserve) {
constexpr size_t RES = 1000, TOT = 2048, REQ = 2048;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册