From cf1df5d3cb34d59258427c14b3fe86c77e2e2246 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Thu, 29 Nov 2018 17:30:33 -0800 Subject: [PATCH] JemallocNodumpAllocator: option to limit tcache memory usage (#4736) Summary: Add option to limit tcache usage by allocation size. This is to reduce total tcache size in case there are many user threads accessing the allocator and incurring non-trivial memory usage. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4736 Differential Revision: D13269305 Pulled By: yiwu-arbug fbshipit-source-id: 95a9b7fc67facd66837c849137e30e137112e19d --- include/rocksdb/cache.h | 4 ++++ include/rocksdb/memory_allocator.h | 35 +++++++++++++++++++++++++++++- util/jemalloc_nodump_allocator.cc | 26 +++++++++++++++++----- util/jemalloc_nodump_allocator.h | 8 +++++-- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 3ceda0d01..190112b37 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -62,6 +62,10 @@ struct LRUCacheOptions { // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using // the cache! + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). std::shared_ptr memory_allocator; LRUCacheOptions() {} diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 15aab65fc..889c0e921 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -36,9 +36,42 @@ class MemoryAllocator { } }; -// Generate cache allocators which allocates through Jemalloc and utilize +struct JemallocAllocatorOptions { + // Jemalloc tcache caches allocations by size class. For each size class, + // it caches between 20 (for large size classes) and 200 (for small size + // classes). 
To reduce tcache memory usage in case the allocator is accessed + // by a large number of threads, we can control whether to cache an allocation + // by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size/4. + size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generate memory allocators which allocate through Jemalloc and utilize // MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dump. +// +// Implementation details: +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. A side benefit of using a single arena is reduced jemalloc +// metadata for some workloads. +// +// To mitigate mutex contention from using one single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. 
extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); } // namespace rocksdb diff --git a/util/jemalloc_nodump_allocator.cc b/util/jemalloc_nodump_allocator.cc index 5b8b735bc..a783d33d5 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/util/jemalloc_nodump_allocator.cc @@ -19,15 +19,21 @@ namespace rocksdb { std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; JemallocNodumpAllocator::JemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::unique_ptr&& arena_hooks, unsigned arena_index) - : arena_hooks_(std::move(arena_hooks)), + : options_(options), + arena_hooks_(std::move(arena_hooks)), arena_index_(arena_index), tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} -int JemallocNodumpAllocator::GetThreadSpecificCache() { +int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { // We always enable tcache. The only corner case is when there are a ton of // threads accessing with low frequency, then it could consume a lot of // memory (may reach # threads * ~1MB) without bringing too much benefit. + if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound || + size > options_.tcache_size_upper_bound)) { + return MALLOCX_TCACHE_NONE; + } unsigned* tcache_index = reinterpret_cast(tcache_.Get()); if (UNLIKELY(tcache_index == nullptr)) { // Instantiate tcache. @@ -46,13 +52,17 @@ int JemallocNodumpAllocator::GetThreadSpecificCache() { } void* JemallocNodumpAllocator::Allocate(size_t size) { - int tcache_flag = GetThreadSpecificCache(); + int tcache_flag = GetThreadSpecificCache(size); return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); } void JemallocNodumpAllocator::Deallocate(void* p) { // Obtain tcache. 
- int tcache_flag = GetThreadSpecificCache(); + size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); // No need to pass arena index to dallocx(). Jemalloc will find arena index // from its own metadata. dallocx(p, tcache_flag); @@ -120,6 +130,7 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { *memory_allocator = nullptr; #ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR @@ -130,6 +141,11 @@ Status NewJemallocNodumpAllocator( if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); } + if (options.limit_tcache_size && + options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } // Create arena. unsigned arena_index = 0; @@ -177,7 +193,7 @@ Status NewJemallocNodumpAllocator( // Create cache allocator. 
memory_allocator->reset( - new JemallocNodumpAllocator(std::move(new_hooks), arena_index)); + new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index)); return Status::OK(); #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR } diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h index 209cee234..914088de1 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/util/jemalloc_nodump_allocator.h @@ -25,7 +25,8 @@ namespace rocksdb { class JemallocNodumpAllocator : public MemoryAllocator { public: - JemallocNodumpAllocator(std::unique_ptr&& arena_hooks, + JemallocNodumpAllocator(JemallocAllocatorOptions& options, + std::unique_ptr&& arena_hooks, unsigned arena_index); ~JemallocNodumpAllocator(); @@ -36,6 +37,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { private: friend Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); // Custom alloc hook to replace jemalloc default alloc. @@ -51,7 +53,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { // Get or create tcache. Return flag suitable to use with `mallocx`: // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). - int GetThreadSpecificCache(); + int GetThreadSpecificCache(size_t size); // A function pointer to jemalloc default alloc. Use atomic to make sure // NewJemallocNodumpAllocator is thread-safe. @@ -60,6 +62,8 @@ class JemallocNodumpAllocator : public MemoryAllocator { // alloc needs to be static to pass to jemalloc as function pointer. static std::atomic original_alloc_; + const JemallocAllocatorOptions options_; + // Custom hooks has to outlive corresponding arena. const std::unique_ptr arena_hooks_; -- GitLab