提交 cf1df5d3 编写于 作者: Y Yi Wu 提交者: Facebook Github Bot

JemallocNodumpAllocator: option to limit tcache memory usage (#4736)

Summary:
Add option to limit tcache usage by allocation size. This is to reduce total tcache size in case there are many user threads accessing the allocator and incur non-trivial memory usage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4736

Differential Revision: D13269305

Pulled By: yiwu-arbug

fbshipit-source-id: 95a9b7fc67facd66837c849137e30e137112e19d
上级 70645355
...@@ -62,6 +62,10 @@ struct LRUCacheOptions { ...@@ -62,6 +62,10 @@ struct LRUCacheOptions {
// If non-nullptr will use this allocator instead of system allocator when // If non-nullptr will use this allocator instead of system allocator when
// allocating memory for cache blocks. Call this method before you start using // allocating memory for cache blocks. Call this method before you start using
// the cache! // the cache!
//
// Caveat: when the cache is used as block cache, the memory allocator is
// ignored when dealing with compression libraries that allocate memory
// internally (currently only XPRESS).
std::shared_ptr<MemoryAllocator> memory_allocator; std::shared_ptr<MemoryAllocator> memory_allocator;
LRUCacheOptions() {} LRUCacheOptions() {}
......
...@@ -36,9 +36,42 @@ class MemoryAllocator { ...@@ -36,9 +36,42 @@ class MemoryAllocator {
} }
}; };
// Generate cache allocators which allocates through Jemalloc and utilize struct JemallocAllocatorOptions {
// Jemalloc tcache cache allocations by size class. For each size class,
// it caches between 20 (for large size classes) to 200 (for small size
// classes). To reduce tcache memory usage in case the allocator is access
// by large number of threads, we can control whether to cache an allocation
// by its size.
bool limit_tcache_size = false;
// Lower bound of allocation size to use tcache, if limit_tcache_size=true.
// When used with block cache, it is recommended to set it to block_size/4.
size_t tcache_size_lower_bound = 1024;
// Upper bound of allocation size to use tcache, if limit_tcache_size=true.
// When used with block cache, it is recommended to set it to block_size.
size_t tcache_size_upper_bound = 16 * 1024;
};
// Generate memory allocators which allocate through Jemalloc and utilize
// MADV_DONTDUMP through madvise to exclude cache items from core dump. // MADV_DONTDUMP through madvise to exclude cache items from core dump.
// Applications can use the allocator with block cache to exclude block cache
// usage from core dump.
//
// Implementation details:
// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
// allocations of the JemallocNodumpAllocator are through the same arena.
// The memory allocator hooks memory allocation of the arena, and calls
// madvise() with the MADV_DONTDUMP flag to exclude the piece of memory from
// core dump. A side benefit of using a single arena is reduced jemalloc
// metadata for some workloads.
//
// To mitigate mutex contention for using one single arena, jemalloc tcache
// (thread-local cache) is enabled to cache unused allocations for future use.
// The tcache normally incurs 0.5M extra memory usage per-thread. The usage
// can be reduced by limiting allocation sizes to cache.
extern Status NewJemallocNodumpAllocator( extern Status NewJemallocNodumpAllocator(
JemallocAllocatorOptions& options,
std::shared_ptr<MemoryAllocator>* memory_allocator); std::shared_ptr<MemoryAllocator>* memory_allocator);
} // namespace rocksdb } // namespace rocksdb
...@@ -19,15 +19,21 @@ namespace rocksdb { ...@@ -19,15 +19,21 @@ namespace rocksdb {
std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr}; std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
JemallocNodumpAllocator::JemallocNodumpAllocator( JemallocNodumpAllocator::JemallocNodumpAllocator(
JemallocAllocatorOptions& options,
std::unique_ptr<extent_hooks_t>&& arena_hooks, unsigned arena_index) std::unique_ptr<extent_hooks_t>&& arena_hooks, unsigned arena_index)
: arena_hooks_(std::move(arena_hooks)), : options_(options),
arena_hooks_(std::move(arena_hooks)),
arena_index_(arena_index), arena_index_(arena_index),
tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {}
int JemallocNodumpAllocator::GetThreadSpecificCache() { int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
// We always enable tcache. The only corner case is when there are a ton of // We always enable tcache. The only corner case is when there are a ton of
// threads accessing with low frequency, then it could consume a lot of // threads accessing with low frequency, then it could consume a lot of
// memory (may reach # threads * ~1MB) without bringing too much benefit. // memory (may reach # threads * ~1MB) without bringing too much benefit.
if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
size > options_.tcache_size_upper_bound)) {
return MALLOCX_TCACHE_NONE;
}
unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get()); unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
if (UNLIKELY(tcache_index == nullptr)) { if (UNLIKELY(tcache_index == nullptr)) {
// Instantiate tcache. // Instantiate tcache.
...@@ -46,13 +52,17 @@ int JemallocNodumpAllocator::GetThreadSpecificCache() { ...@@ -46,13 +52,17 @@ int JemallocNodumpAllocator::GetThreadSpecificCache() {
} }
void* JemallocNodumpAllocator::Allocate(size_t size) { void* JemallocNodumpAllocator::Allocate(size_t size) {
int tcache_flag = GetThreadSpecificCache(); int tcache_flag = GetThreadSpecificCache(size);
return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
} }
void JemallocNodumpAllocator::Deallocate(void* p) { void JemallocNodumpAllocator::Deallocate(void* p) {
// Obtain tcache. // Obtain tcache.
int tcache_flag = GetThreadSpecificCache(); size_t size = 0;
if (options_.limit_tcache_size) {
size = malloc_usable_size(p);
}
int tcache_flag = GetThreadSpecificCache(size);
// No need to pass arena index to dallocx(). Jemalloc will find arena index // No need to pass arena index to dallocx(). Jemalloc will find arena index
// from its own metadata. // from its own metadata.
dallocx(p, tcache_flag); dallocx(p, tcache_flag);
...@@ -120,6 +130,7 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, ...@@ -120,6 +130,7 @@ size_t JemallocNodumpAllocator::UsableSize(void* p,
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
Status NewJemallocNodumpAllocator( Status NewJemallocNodumpAllocator(
JemallocAllocatorOptions& options,
std::shared_ptr<MemoryAllocator>* memory_allocator) { std::shared_ptr<MemoryAllocator>* memory_allocator) {
*memory_allocator = nullptr; *memory_allocator = nullptr;
#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR #ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
...@@ -130,6 +141,11 @@ Status NewJemallocNodumpAllocator( ...@@ -130,6 +141,11 @@ Status NewJemallocNodumpAllocator(
if (memory_allocator == nullptr) { if (memory_allocator == nullptr) {
return Status::InvalidArgument("memory_allocator must be non-null."); return Status::InvalidArgument("memory_allocator must be non-null.");
} }
if (options.limit_tcache_size &&
options.tcache_size_lower_bound >= options.tcache_size_upper_bound) {
return Status::InvalidArgument(
"tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
}
// Create arena. // Create arena.
unsigned arena_index = 0; unsigned arena_index = 0;
...@@ -177,7 +193,7 @@ Status NewJemallocNodumpAllocator( ...@@ -177,7 +193,7 @@ Status NewJemallocNodumpAllocator(
// Create cache allocator. // Create cache allocator.
memory_allocator->reset( memory_allocator->reset(
new JemallocNodumpAllocator(std::move(new_hooks), arena_index)); new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index));
return Status::OK(); return Status::OK();
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
} }
......
...@@ -25,7 +25,8 @@ namespace rocksdb { ...@@ -25,7 +25,8 @@ namespace rocksdb {
class JemallocNodumpAllocator : public MemoryAllocator { class JemallocNodumpAllocator : public MemoryAllocator {
public: public:
JemallocNodumpAllocator(std::unique_ptr<extent_hooks_t>&& arena_hooks, JemallocNodumpAllocator(JemallocAllocatorOptions& options,
std::unique_ptr<extent_hooks_t>&& arena_hooks,
unsigned arena_index); unsigned arena_index);
~JemallocNodumpAllocator(); ~JemallocNodumpAllocator();
...@@ -36,6 +37,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { ...@@ -36,6 +37,7 @@ class JemallocNodumpAllocator : public MemoryAllocator {
private: private:
friend Status NewJemallocNodumpAllocator( friend Status NewJemallocNodumpAllocator(
JemallocAllocatorOptions& options,
std::shared_ptr<MemoryAllocator>* memory_allocator); std::shared_ptr<MemoryAllocator>* memory_allocator);
// Custom alloc hook to replace jemalloc default alloc. // Custom alloc hook to replace jemalloc default alloc.
...@@ -51,7 +53,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { ...@@ -51,7 +53,7 @@ class JemallocNodumpAllocator : public MemoryAllocator {
// Get or create tcache. Return flag suitable to use with `mallocx`: // Get or create tcache. Return flag suitable to use with `mallocx`:
// either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
int GetThreadSpecificCache(); int GetThreadSpecificCache(size_t size);
// A function pointer to jemalloc default alloc. Use atomic to make sure // A function pointer to jemalloc default alloc. Use atomic to make sure
// NewJemallocNodumpAllocator is thread-safe. // NewJemallocNodumpAllocator is thread-safe.
...@@ -60,6 +62,8 @@ class JemallocNodumpAllocator : public MemoryAllocator { ...@@ -60,6 +62,8 @@ class JemallocNodumpAllocator : public MemoryAllocator {
// alloc needs to be static to pass to jemalloc as function pointer. // alloc needs to be static to pass to jemalloc as function pointer.
static std::atomic<extent_alloc_t*> original_alloc_; static std::atomic<extent_alloc_t*> original_alloc_;
const JemallocAllocatorOptions options_;
// Custom hooks has to outlive corresponding arena. // Custom hooks has to outlive corresponding arena.
const std::unique_ptr<extent_hooks_t> arena_hooks_; const std::unique_ptr<extent_hooks_t> arena_hooks_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册