提交 0a56a5b1 编写于 作者: M Megvii Engine Team

feat(cuda/comp_node): get (maximum) GPU memory allocated/reserved

GitOrigin-RevId: da2cc22436022ac5187ce3d2a686cc258ac94150
上级 896a6fb0
...@@ -25,6 +25,11 @@ __all__ = [ ...@@ -25,6 +25,11 @@ __all__ = [
"set_default_device", "set_default_device",
"get_mem_status_bytes", "get_mem_status_bytes",
"get_cuda_compute_capability", "get_cuda_compute_capability",
"get_allocated_memory",
"get_reserved_memory",
"get_max_reserved_memory",
"get_max_allocated_memory",
"reset_max_memory_stats",
"set_prealloc_config", "set_prealloc_config",
"coalesce_free_memory", "coalesce_free_memory",
"DeviceType", "DeviceType",
...@@ -157,6 +162,61 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int ...@@ -157,6 +162,61 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
return _get_cuda_compute_capability(device, device_type) return _get_cuda_compute_capability(device, device_type)
def get_allocated_memory(device: Optional[str] = None):
    r"""Returns the current memory occupied by tensors on the computing device in bytes.

    :param device: the device to query; when omitted, the current default device is used.

    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
    before calling this function in order to get accurate value.
    """
    target = get_default_device() if device is None else device
    return CompNode(target).get_used_memory
def get_reserved_memory(device: Optional[str] = None):
    r"""Returns the current memory managed by the caching allocator on the computing device in bytes.

    :param device: the device to query; when omitted, the current default device is used.

    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
    before calling this function in order to get accurate value.
    """
    target = get_default_device() if device is None else device
    return CompNode(target).get_reserved_memory
def get_max_reserved_memory(device: Optional[str] = None):
    r"""Returns the maximum memory managed by the caching allocator on the computing device in bytes.

    :param device: the device to query; when omitted, the current default device is used.

    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
    before calling this function in order to get accurate value.
    """
    target = get_default_device() if device is None else device
    return CompNode(target).get_max_reserved_memory
def get_max_allocated_memory(device: Optional[str] = None):
    r"""Returns the maximum memory occupied by tensors on the computing device in bytes.

    :param device: the device to query; when omitted, the current default device is used.

    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
    before calling this function in order to get accurate value.
    """
    target = get_default_device() if device is None else device
    return CompNode(target).get_max_used_memory
def reset_max_memory_stats(device: Optional[str] = None):
    r"""Resets the maximum stats on the computing device.

    :param device: the device whose peak-memory statistics are cleared;
        when omitted, the current default device is used.

    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
    before calling this function in order to properly reset memory stats.
    """
    target = get_default_device() if device is None else device
    CompNode.reset_max_memory_stats(target)
set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux")) set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux"))
......
...@@ -73,6 +73,26 @@ void init_common(py::module m) { ...@@ -73,6 +73,26 @@ void init_common(py::module m) {
[](const CompNode& cn) { [](const CompNode& cn) {
return cn.get_mem_status_bytes(); return cn.get_mem_status_bytes();
}) })
.def_property_readonly(
"get_used_memory",
[](const CompNode& cn) { return cn.get_used_memory(); })
.def_property_readonly(
"get_max_used_memory",
[](const CompNode& cn) { return cn.get_max_used_memory(); })
.def_property_readonly(
"get_reserved_memory",
[](const CompNode& cn) { return cn.get_reserved_memory(); })
.def_property_readonly(
"get_max_reserved_memory",
[](const CompNode& cn) {
return cn.get_max_reserved_memory();
})
.def_static(
"reset_max_memory_stats",
[](const CompNode& cn) {
cn.reset_max_used_memory();
cn.reset_max_reserved_memory();
})
.def("create_event", &CompNode::create_event, .def("create_event", &CompNode::create_event,
py::arg("flags") = 0ul) py::arg("flags") = 0ul)
.def_static("_set_default_device", &set_default_device) .def_static("_set_default_device", &set_default_device)
......
...@@ -208,20 +208,7 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { ...@@ -208,20 +208,7 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
public: public:
CompNodeImpl() : Impl(static_free_device, static_free_host) {} CompNodeImpl() : Impl(static_free_device, static_free_host) {}
void* alloc_device(size_t size) override { void* alloc_device(size_t size) override;
activate();
#if MGB_BUILD_SLIM_SERVING
return m_mem_alloc->alloc(size);
#else
void* ptr = m_mem_alloc->alloc(size);
{
MGB_LOCK_GUARD(m_update_mem);
ptr2size[ptr] = size;
m_used_mem += size;
}
return ptr;
#endif
}
void free_device(void* ptr); void free_device(void* ptr);
...@@ -311,20 +298,30 @@ public: ...@@ -311,20 +298,30 @@ public:
uint64_t get_uid() override { return m_uid; } uint64_t get_uid() override { return m_uid; }
#if !MGB_BUILD_SLIM_SERVING #if !MGB_BUILD_SLIM_SERVING
size_t get_used_memory() override { return m_used_mem; } size_t get_used_memory() override;
size_t get_max_used_memory() override;
size_t get_reserved_memory() override;
size_t get_max_reserved_memory() override;
void reset_max_used_memory() override;
void reset_max_reserved_memory() override;
#endif #endif
private: private:
uint64_t m_uid; uint64_t m_uid;
#if !MGB_BUILD_SLIM_SERVING #if !MGB_BUILD_SLIM_SERVING
std::unordered_map<void*, size_t> ptr2size; std::unordered_map<void*, size_t> ptr2size;
size_t m_used_mem = 0;
#endif #endif
}; };
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl); MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
struct CudaCompNodeImpl::DeviceInfo { struct CudaCompNodeImpl::DeviceInfo {
int dev_num = -1; int dev_num = -1;
std::atomic_size_t m_used_mem{0};
std::atomic_size_t m_max_used_mem{0};
std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc; std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc;
bool init_done() const { return mem_alloc.get(); } bool init_done() const { return mem_alloc.get(); }
...@@ -438,6 +435,24 @@ void CudaCompNodeImpl::fini() { ...@@ -438,6 +435,24 @@ void CudaCompNodeImpl::fini() {
m_initialized = false; m_initialized = false;
} }
//! Allocate device memory and account it against the per-device usage
//! statistics (disabled in slim-serving builds).
void* CudaCompNodeImpl::alloc_device(size_t size) {
    activate();
#if MGB_BUILD_SLIM_SERVING
    return m_mem_alloc->alloc(size);
#else
    void* ptr = m_mem_alloc->alloc(size);
    {
        MGB_LOCK_GUARD(m_update_mem);
        ptr2size[ptr] = size;
        // m_device_info is shared by every comp node (stream) on this device,
        // while m_update_mem is a per-comp-node lock, so the peak must be
        // updated with a CAS loop: a plain check-then-store could lose a
        // larger peak written concurrently from another stream.
        size_t used = m_device_info->m_used_mem += size;
        size_t prev_max = m_device_info->m_max_used_mem.load();
        while (used > prev_max &&
               !m_device_info->m_max_used_mem.compare_exchange_weak(
                       prev_max, used)) {
        }
    }
    return ptr;
#endif
}
void CudaCompNodeImpl::free_device(void* ptr) { void CudaCompNodeImpl::free_device(void* ptr) {
if (check_global_finalized()) if (check_global_finalized())
return; return;
...@@ -447,13 +462,39 @@ void CudaCompNodeImpl::free_device(void* ptr) { ...@@ -447,13 +462,39 @@ void CudaCompNodeImpl::free_device(void* ptr) {
{ {
MGB_LOCK_GUARD(m_update_mem); MGB_LOCK_GUARD(m_update_mem);
mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr); mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr);
m_used_mem -= ptr2size.at(ptr); m_device_info->m_used_mem -= ptr2size.at(ptr);
ptr2size.erase(ptr); ptr2size.erase(ptr);
} }
#endif #endif
m_mem_alloc->free(ptr); m_mem_alloc->free(ptr);
} }
#if !MGB_BUILD_SLIM_SERVING
//! Bytes currently held by live tensors on this device (sum of all streams,
//! since the counters live in the shared per-device DeviceInfo).
size_t CudaCompNodeImpl::get_used_memory() {
return m_device_info->m_used_mem.load();
}
//! High-water mark of get_used_memory() since startup or the last reset.
size_t CudaCompNodeImpl::get_max_used_memory() {
return m_device_info->m_max_used_mem.load();
}
//! Clear the tensor-usage high-water mark; the current usage is unaffected.
void CudaCompNodeImpl::reset_max_used_memory() {
m_device_info->m_max_used_mem = 0;
}
//! Bytes currently reserved by the caching device allocator (may exceed the
//! tensor usage above, since freed chunks are kept for reuse).
size_t CudaCompNodeImpl::get_reserved_memory() {
return m_device_info->mem_alloc->get_used_memory();
}
//! High-water mark of the allocator reservation since startup or the last reset.
size_t CudaCompNodeImpl::get_max_reserved_memory() {
return m_device_info->mem_alloc->get_max_used_memory();
}
//! Clear the allocator-reservation high-water mark.
void CudaCompNodeImpl::reset_max_reserved_memory() {
m_device_info->mem_alloc->reset_max_used_memory();
}
#endif
void* CudaCompNodeImpl::alloc_host(size_t size) { void* CudaCompNodeImpl::alloc_host(size_t size) {
// need activate because it create cuda cuda context in current device // need activate because it create cuda cuda context in current device
activate(); activate();
......
...@@ -226,6 +226,9 @@ StreamMemAlloc* DevMemAllocImpl::add_stream(StreamKey stream) { ...@@ -226,6 +226,9 @@ StreamMemAlloc* DevMemAllocImpl::add_stream(StreamKey stream) {
//! Allocate from this device allocator and update the usage statistics.
MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc(size_t size) {
    auto addr = do_alloc(size, true);
    // The counters are atomics updated from multiple streams; the peak must
    // therefore be maintained with a CAS loop — a plain check-then-store on
    // m_max_used_size could lose a larger peak written concurrently.
    size_t used = m_used_size += size;
    size_t prev_max = m_max_used_size.load();
    while (used > prev_max &&
           !m_max_used_size.compare_exchange_weak(prev_max, used)) {
    }
    return addr;
}
...@@ -291,6 +294,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) { ...@@ -291,6 +294,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
// exception would be thrown from here // exception would be thrown from here
auto t = do_alloc(size, false, true); auto t = do_alloc(size, false, true);
m_used_size += size; m_used_size += size;
if (m_used_size > m_max_used_size) {
m_max_used_size = m_used_size.load();
}
return t; return t;
} }
} }
...@@ -419,6 +425,9 @@ void DevMemAllocImpl::insert_free_unsafe(const FreeBlock& block) { ...@@ -419,6 +425,9 @@ void DevMemAllocImpl::insert_free_unsafe(const FreeBlock& block) {
child->insert_free_unsafe(block); child->insert_free_unsafe(block);
} }
m_used_size += block.size; m_used_size += block.size;
if (m_used_size > m_max_used_size) {
m_max_used_size = m_used_size.load();
}
} else { } else {
MemAllocImplHelper::insert_free_unsafe(block); MemAllocImplHelper::insert_free_unsafe(block);
} }
......
...@@ -171,6 +171,7 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper { ...@@ -171,6 +171,7 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
size_t m_tot_allocated_from_raw = 0; size_t m_tot_allocated_from_raw = 0;
std::atomic_size_t m_used_size{0}; std::atomic_size_t m_used_size{0};
std::atomic_size_t m_max_used_size{0};
/*! /*!
* \brief gather all free blocks from child streams, and release full chunks * \brief gather all free blocks from child streams, and release full chunks
...@@ -197,6 +198,10 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper { ...@@ -197,6 +198,10 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
size_t get_used_memory() override { return m_used_size.load(); } size_t get_used_memory() override { return m_used_size.load(); }
size_t get_max_used_memory() override { return m_max_used_size.load(); }
void reset_max_used_memory() override { m_max_used_size = 0; }
void insert_free_unsafe(const FreeBlock& block) override; void insert_free_unsafe(const FreeBlock& block) override;
/*! /*!
......
...@@ -335,11 +335,23 @@ public: ...@@ -335,11 +335,23 @@ public:
size_t get_used_memory() const { return m_impl->get_used_memory(); } size_t get_used_memory() const { return m_impl->get_used_memory(); }
size_t get_reserved_memory() const { return m_impl->get_reserved_memory(); }
size_t get_max_reserved_memory() const { return m_impl->get_max_reserved_memory(); }
size_t get_max_used_memory() const { return m_impl->get_max_used_memory(); }
size_t get_max_block_size_available() const { size_t get_max_block_size_available() const {
return m_impl->get_max_block_size_available(); return m_impl->get_max_block_size_available();
} }
size_t get_free_mem() const { return m_impl->get_free_mem(); } size_t get_free_mem() const { return m_impl->get_free_mem(); }
void reset_max_reserved_memory() const {
return m_impl->reset_max_reserved_memory();
}
void reset_max_used_memory() const { return m_impl->reset_max_used_memory(); }
#endif #endif
//! change to another stream on the same memory node //! change to another stream on the same memory node
...@@ -533,8 +545,13 @@ protected: ...@@ -533,8 +545,13 @@ protected:
return {x - x, y - y}; return {x - x, y - y};
} }
virtual size_t get_used_memory() { return 0; } virtual size_t get_used_memory() { return 0; }
virtual size_t get_reserved_memory() { return 0; }
virtual size_t get_max_reserved_memory() { return 0; }
virtual size_t get_max_used_memory() { return 0; }
virtual size_t get_max_block_size_available() { return 0; } virtual size_t get_max_block_size_available() { return 0; }
virtual size_t get_free_mem() { return 0; } virtual size_t get_free_mem() { return 0; }
virtual void reset_max_reserved_memory() {}
virtual void reset_max_used_memory() {}
#endif #endif
virtual Locator locator() = 0; virtual Locator locator() = 0;
......
...@@ -275,6 +275,10 @@ public: ...@@ -275,6 +275,10 @@ public:
const PreAllocConfig& prealloc_config() { return m_prealloc_config; } const PreAllocConfig& prealloc_config() { return m_prealloc_config; }
virtual size_t get_used_memory() { return 0; }
virtual size_t get_max_used_memory() { return 0; }
virtual void reset_max_used_memory() {}
private: private:
size_t m_alignment = 1; size_t m_alignment = 1;
PreAllocConfig m_prealloc_config; PreAllocConfig m_prealloc_config;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册