未验证 提交 3fdc105f 编写于 作者: Z zhangbo9674 提交者: GitHub

Refine munmap freq for RefcountedMemoryMapAllocation (#49691)

* refine munmap freq for ref_cnt_mmap_allocator

* add shm reuse logic

* fix compile bug

* fix compile bug

* fix bug of file refcount

* fix compile bug

* fix compile bug

* refine code for delete shm case

* polish code

* refine shm cache pool size setting logic

* set buffer is 2

* refine shm cache size logic

* refine max shm cache

* refine shm cache size
上级 18745e6f
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
DECLARE_bool(use_shm_cache);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
...@@ -111,20 +113,33 @@ void AllocateMemoryMap( ...@@ -111,20 +113,33 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation> std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags, int flags,
size_t size) { size_t size,
int buffer_id) {
int fd = -1; int fd = -1;
void *base_ptr = nullptr; void *base_ptr = nullptr;
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); if (buffer_id == -1) {
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
VLOG(4) << "Create and mmap a new shm: " << filename;
} else {
base_ptr = MemoryMapAllocationPool::Instance().GetById(buffer_id).mmap_ptr_;
VLOG(4) << "Get a cached shm " << filename;
}
void *aliged_base_ptr = void *aliged_base_ptr =
static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment); static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
return std::make_shared<RefcountedMemoryMapAllocation>( return std::make_shared<RefcountedMemoryMapAllocation>(
aliged_base_ptr, size, filename, flags, fd); aliged_base_ptr, size, filename, flags, fd, buffer_id);
} }
// Constructs a refcounted shm allocation on top of MemoryMapAllocation.
//
// NOTE(review): the header declares these parameters as
// (..., int flags, int fd, int buffer_id) but this definition names them
// (..., int fd, int flags, ...). Both are `int`, so it compiles, yet the
// call site passes (flags, fd) following the header — meaning the value
// named `fd` here actually carries `flags` and vice versa. Confirm the
// order the MemoryMapAllocation base constructor expects and unify the
// parameter names in header and definition.
RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
    void *ptr,
    size_t size,
    std::string ipc_name,
    int fd,
    int flags,
    int buffer_id)
    : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
  // must reset base ptr first.
  // buffer_id_ == -1 indicates the mapping was freshly created rather than
  // taken from MemoryMapAllocationPool (see AllocateRefcountedMemoryMapAllocation).
  buffer_id_ = buffer_id;
  resetBaseptr();
  initializeRefercount();
}
...@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() { ...@@ -165,25 +180,40 @@ void RefcountedMemoryMapAllocation::initializeRefercount() {
} }
void RefcountedMemoryMapAllocation::close() { void RefcountedMemoryMapAllocation::close() {
VLOG(4) << "Close a RefcountedMemoryMapAllocation: " << ipc_name_;
if (closed_) { if (closed_) {
return; return;
} }
closed_ = true; closed_ = true;
void *data = map_ptr_; void *data = map_ptr_;
CountInfo *info = reinterpret_cast<CountInfo *>(data); CountInfo *info = reinterpret_cast<CountInfo *>(data);
if (--info->refcount == 0) { --info->refcount;
shm_unlink(ipc_name_.c_str()); if (FLAGS_use_shm_cache && buffer_id_ != -1) {
VLOG(6) << "shm_unlink file: " << ipc_name_; return;
} else {
if (FLAGS_use_shm_cache &&
MemoryMapAllocationPool::Instance().BufferSize() <
static_cast<size_t>(
MemoryMapAllocationPool::Instance().MaxPoolSize())) {
MemoryMapAllocationPool::Instance().Insert(MemoryMapInfo(
flags_, map_size_ - mmap_alignment, ipc_name_, map_ptr_));
} else {
if (info->refcount == 0 &&
shm_open(ipc_name_.c_str(), O_RDWR, (mode_t)0600) != -1) {
shm_unlink(ipc_name_.c_str());
VLOG(6) << "shm_unlink file: " << ipc_name_;
}
PADDLE_ENFORCE_NE(munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable(
"could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
}
} }
PADDLE_ENFORCE_NE(
munmap(map_ptr_, map_size_),
-1,
platform::errors::Unavailable("could not unmap the shared memory file: ",
strerror(errno),
" (",
errno,
")"));
} }
MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
...@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() { ...@@ -299,6 +329,67 @@ void MemoryMapFdSet::Clear() {
MemoryMapFdSet::~MemoryMapFdSet() { Clear(); } MemoryMapFdSet::~MemoryMapFdSet() { Clear(); }
// Lazily-initialized singleton storage (see Instance() in the header).
MemoryMapAllocationPool *MemoryMapAllocationPool::pool_ = nullptr;

// Appends a shm mapping to the reuse cache. Thread-safe via mtx_.
void MemoryMapAllocationPool::Insert(const MemoryMapInfo &memory_map) {
  std::lock_guard<std::mutex> guard(mtx_);
  memory_map_allocations_.push_back(memory_map);
  // Fixed log typo ("Intsert") and missing separator after the `this` pointer.
  VLOG(4) << this << " Insert a new shm: " << memory_map.file_name_;
}
int MemoryMapAllocationPool::FindFromCache(const int &flag,
const size_t &data_size,
const std::string &file_name,
bool check_refcount) {
std::lock_guard<std::mutex> guard(mtx_);
for (size_t idx = 0; idx < memory_map_allocations_.size(); idx++) {
if (memory_map_allocations_.at(idx).flags_ == flag &&
memory_map_allocations_.at(idx).data_size_ == data_size) {
if (file_name == "" ||
memory_map_allocations_.at(idx).file_name_ == file_name) {
if (!check_refcount || reinterpret_cast<CountInfo *>(
memory_map_allocations_.at(idx).mmap_ptr_)
->refcount == 0) {
VLOG(4) << "Match at: " << idx;
return idx;
}
}
}
}
return -1;
}
// Returns a reference to the cached entry at index `id` (bounds-checked;
// throws std::out_of_range on a bad index).
// NOTE(review): the returned reference escapes the lock scope — if another
// thread's Insert() reallocates memory_map_allocations_ afterwards, the
// reference dangles. Presumably callers use it immediately while the pool
// is quiescent; TODO confirm.
const MemoryMapInfo &MemoryMapAllocationPool::GetById(int id) {
  std::lock_guard<std::mutex> guard(mtx_);
  return memory_map_allocations_.at(id);
}
// Sets the maximum number of shm mappings the pool will cache.
// NOTE(review): written without holding mtx_ — presumably configured before
// worker threads start touching the pool; confirm if that ever changes.
void MemoryMapAllocationPool::SetMaxPoolSize(const int &size) {
  max_pool_size_ = size;
  // Fixed missing separator between the `this` pointer and the message.
  VLOG(4) << this << " Set max pool size is: " << max_pool_size_;
}
// Unlinks and unmaps every cached shm mapping, then empties the cache.
// Thread-safe via mtx_.
void MemoryMapAllocationPool::Clear() {
  std::lock_guard<std::mutex> guard(mtx_);
  // Iterate by reference: the old `for (auto mmap : ...)` copied each
  // MemoryMapInfo (including its std::string) per iteration and shadowed
  // the POSIX ::mmap function name.
  for (auto &mmap_info : memory_map_allocations_) {
    int rlt = shm_unlink(mmap_info.file_name_.c_str());
    if (rlt == 0) {
      VLOG(4) << "MemoryMapAllocationPool: clear " << mmap_info.file_name_;
    }
    // The full mapping includes the alignment header in front of the data.
    PADDLE_ENFORCE_NE(
        munmap(mmap_info.mmap_ptr_, mmap_info.data_size_ + mmap_alignment),
        -1,
        platform::errors::Unavailable("could not unmap the shared memory file: ",
                                      strerror(errno),
                                      " (",
                                      errno,
                                      ")"));
  }
  memory_map_allocations_.clear();
}
MemoryMapAllocationPool::~MemoryMapAllocationPool() { Clear(); }
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
......
...@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation { ...@@ -75,8 +75,12 @@ class MemoryMapAllocation : public Allocation {
class RefcountedMemoryMapAllocation : public MemoryMapAllocation { class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
public: public:
RefcountedMemoryMapAllocation( RefcountedMemoryMapAllocation(void *ptr,
void *ptr, size_t size, std::string ipc_name, int flags, int fd); size_t size,
std::string ipc_name,
int flags,
int fd,
int buffer_id = -1);
void incref(); void incref();
int decref(); int decref();
...@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation { ...@@ -84,6 +88,7 @@ class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
virtual ~RefcountedMemoryMapAllocation() { close(); } virtual ~RefcountedMemoryMapAllocation() { close(); }
protected: protected:
int buffer_id_ = -1;
void initializeRefercount(); void initializeRefercount();
void resetBaseptr(); void resetBaseptr();
}; };
...@@ -94,7 +99,8 @@ void AllocateMemoryMap( ...@@ -94,7 +99,8 @@ void AllocateMemoryMap(
std::shared_ptr<RefcountedMemoryMapAllocation> std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, AllocateRefcountedMemoryMapAllocation(std::string filename,
int flags, int flags,
size_t size); size_t size,
int buffer_id = -1);
class MemoryMapWriterAllocation : public Allocation { class MemoryMapWriterAllocation : public Allocation {
public: public:
...@@ -153,6 +159,68 @@ class MemoryMapFdSet { ...@@ -153,6 +159,68 @@ class MemoryMapFdSet {
std::mutex mtx_; std::mutex mtx_;
}; };
// Plain record describing one cached shm mapping in MemoryMapAllocationPool.
class MemoryMapInfo {
 public:
  // @param flags     mmap flags the segment was created with.
  // @param data_size payload size, excluding the mmap_alignment header.
  // @param file_name shm file name (taken by value and moved — the old code
  //                  copied the string a second time into the member).
  // @param mmap_ptr  base address of the full mapping (header included).
  explicit MemoryMapInfo(int flags,
                         size_t data_size,
                         std::string file_name,
                         void *mmap_ptr)
      : flags_(flags),
        data_size_(data_size),
        file_name_(std::move(file_name)),
        mmap_ptr_(mmap_ptr) {}

  int flags_ = 0;
  size_t data_size_ = 0;
  std::string file_name_;
  void *mmap_ptr_ = nullptr;
};
/* Note(zhangbo):
MemoryMapAllocationPool is used to cache and reuse shm mappings, thus reducing
munmap calls in the dataloader. The munmap(shm_mmap_ptr) instruction in
RefcountedMemoryMapAllocation::close() may block other threads of the process.
Therefore, a shm cache-and-reuse scheme is used: shm segments created by the
_share_filename process are cached and reused keyed on their data_size, which
eliminates the problem of munmap blocking other threads.
*/
// Process-wide cache of shm mappings; see the note above the class for the
// motivation (avoiding munmap stalls in the dataloader).
class MemoryMapAllocationPool {
 public:
  // Lazily-created process-wide singleton.
  // NOTE(review): the nullptr check is not synchronized — two threads racing
  // through first use could each construct a pool. Presumably first use
  // happens on a single thread; TODO confirm.
  static MemoryMapAllocationPool &Instance() {
    if (pool_ == nullptr) {
      pool_ = new MemoryMapAllocationPool();
    }
    return *pool_;
  }

  // Appends a cached mapping (locks mtx_).
  void Insert(const MemoryMapInfo &memory_map);

  // Linear scan for a cached mapping matching flags/data_size (and,
  // optionally, file name / zero refcount); returns -1 when none matches.
  int FindFromCache(const int &flag,
                    const size_t &data_size,
                    const std::string &file_name = "",
                    bool check_refcount = true);

  // Reference to the cached entry at index `id`.
  const MemoryMapInfo &GetById(int id);

  // Current number of cached entries.
  // NOTE(review): reads the vector size without holding mtx_.
  size_t BufferSize() { return memory_map_allocations_.size(); }

  // Unlinks/unmaps every cached entry and empties the cache.
  void Clear();

  // Sets the cache capacity used by RefcountedMemoryMapAllocation::close().
  void SetMaxPoolSize(const int &size);

  // NOTE(review): read without holding mtx_, like BufferSize().
  int MaxPoolSize() { return max_pool_size_; }

  ~MemoryMapAllocationPool();

 private:
  MemoryMapAllocationPool() = default;

  static MemoryMapAllocationPool *pool_;
  std::vector<MemoryMapInfo> memory_map_allocations_;
  int max_pool_size_ = 0;
  std::mutex mtx_;
};
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
......
...@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) { ...@@ -624,6 +624,11 @@ void BindImperative(py::module *m_ptr) {
m.def("_cleanup_mmap_fds", m.def("_cleanup_mmap_fds",
[]() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
m.def("_set_max_memory_map_allocation_pool_size", [](int32_t size) {
memory::allocation::MemoryMapAllocationPool::Instance().SetMaxPoolSize(
size);
});
#endif #endif
m.def("start_imperative_gperf_profiler", m.def("start_imperative_gperf_profiler",
......
...@@ -182,6 +182,7 @@ limitations under the License. */ ...@@ -182,6 +182,7 @@ limitations under the License. */
#include "pybind11/stl.h" #include "pybind11/stl.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_shm_cache);
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
...@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -910,9 +911,16 @@ void BindTensor(pybind11::module &m) { // NOLINT
int flags = memory::allocation::MAPPED_SHAREDMEM | int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE; memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName(); std::string handle = memory::allocation::GetIPCName();
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, data_size); // NOLINT
}
if (find_id != -1) {
handle = memory::allocation::MemoryMapAllocationPool::Instance().GetById(find_id).file_name_; // NOLINT
}
auto shared_holder = auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation( memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size); handle, flags, data_size, find_id);
// copy data & reset holder // copy data & reset holder
if (platform::is_cuda_pinned_place(holder->place())) { if (platform::is_cuda_pinned_place(holder->place())) {
...@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -961,10 +969,13 @@ void BindTensor(pybind11::module &m) { // NOLINT
size_t size = t[1].cast<size_t>(); size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM | int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE; memory::allocation::MAPPED_NOCREATE;
int find_id = -1;
if (FLAGS_use_shm_cache) {
find_id = memory::allocation::MemoryMapAllocationPool::Instance().FindFromCache(flags, size, ipc_name, /*check_refcount*/ false); // NOLINT
}
auto shared_holder = auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation( memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size); ipc_name, flags, size, find_id);
// 3. Rebuild Tensor // 3. Rebuild Tensor
tensor.ResetHolderWithType( tensor.ResetHolderWithType(
......
...@@ -1193,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); ...@@ -1193,3 +1193,16 @@ PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache,
false, false,
"Add a persistent ibuilder."); "Add a persistent ibuilder.");
/**
* mmap_allocator related FLAG
* Name: use_shm_cache
* Since Version: 2.5.0
* Value Range: bool, default=true
* Example:
 * Note: If True, mmap_allocator will cache shm file to decrease munmap
* operation.
*/
PADDLE_DEFINE_EXPORTED_bool(use_shm_cache,
true,
"Use shm cache in mmap_allocator.");
...@@ -317,6 +317,7 @@ try: ...@@ -317,6 +317,7 @@ try:
from .libpaddle import _array_to_share_memory_tensor from .libpaddle import _array_to_share_memory_tensor
from .libpaddle import _cleanup_mmap_fds from .libpaddle import _cleanup_mmap_fds
from .libpaddle import _remove_tensor_list_mmap_fds from .libpaddle import _remove_tensor_list_mmap_fds
from .libpaddle import _set_max_memory_map_allocation_pool_size
except Exception as e: except Exception as e:
if has_paddle_dy_lib: if has_paddle_dy_lib:
sys.stderr.write( sys.stderr.write(
......
...@@ -20,6 +20,7 @@ import numbers ...@@ -20,6 +20,7 @@ import numbers
import logging import logging
import itertools import itertools
import threading import threading
import warnings
import numpy as np import numpy as np
from collections import namedtuple from collections import namedtuple
from paddle.fluid.framework import ( from paddle.fluid.framework import (
...@@ -406,6 +407,20 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -406,6 +407,20 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._base_seed = np.random.randint(low=0, high=sys.maxsize) self._base_seed = np.random.randint(low=0, high=sys.maxsize)
# Note(zhangbo): shm_buffer_size is used for MemoryMapAllocationPool.
# MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap in dataloader.
# For more details, please see: paddle/fluid/memory/allocation/mmap_allocator.h
try:
self._worker_shm_buffer_size = (2 + 1) * len(self._dataset[0])
except:
self._worker_shm_buffer_size = 0
warnings.warn(
"Setting the shm cache buffer size to 0, equivalent to not using the shm cache policy."
)
self._main_thread_shm_buffer_size = (
(self._worker_shm_buffer_size) * 2 * self._num_workers
)
# init workers and indices queues and put 2 indices in each indices queue # init workers and indices queues and put 2 indices in each indices queue
self._init_workers() self._init_workers()
for _ in range(self._outstanding_capacity): for _ in range(self._outstanding_capacity):
...@@ -450,6 +465,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -450,6 +465,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._num_workers, self._num_workers,
self._use_shared_memory, self._use_shared_memory,
self._base_seed, self._base_seed,
self._worker_shm_buffer_size,
), ),
) )
worker.daemon = True worker.daemon = True
...@@ -481,6 +497,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -481,6 +497,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._blocking_queue = core.init_lod_tensor_blocking_queue( self._blocking_queue = core.init_lod_tensor_blocking_queue(
core.Variable(), self._outstanding_capacity, len(self._places) > 1 core.Variable(), self._outstanding_capacity, len(self._places) > 1
) )
core._set_max_memory_map_allocation_pool_size(
self._main_thread_shm_buffer_size
)
self._reader = core.create_py_reader( self._reader = core.create_py_reader(
self._blocking_queue, self._blocking_queue,
self._var_names, self._var_names,
......
...@@ -275,6 +275,7 @@ def _worker_loop( ...@@ -275,6 +275,7 @@ def _worker_loop(
num_workers, num_workers,
use_shared_memory, use_shared_memory,
base_seed, base_seed,
shm_cahce_size=0,
): ):
try: try:
# NOTE: [ mmap files clear ] When the child process exits unexpectedly, # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
...@@ -286,6 +287,8 @@ def _worker_loop( ...@@ -286,6 +287,8 @@ def _worker_loop(
# set signal handler # set signal handler
core._set_process_signal_handler() core._set_process_signal_handler()
core._set_max_memory_map_allocation_pool_size(shm_cahce_size)
# set different numpy seed for each worker # set different numpy seed for each worker
try: try:
import numpy as np import numpy as np
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册