From cf1df5d3cb34d59258427c14b3fe86c77e2e2246 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Thu, 29 Nov 2018 17:30:33 -0800 Subject: [PATCH] JemallocNodumpAllocator: option to limit tcache memory usage (#4736) Summary: Add option to limit tcache usage by allocation size. This is to reduce total tcache size in case there are many user threads accessing the allocator and incurring non-trivial memory usage. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4736 Differential Revision: D13269305 Pulled By: yiwu-arbug fbshipit-source-id: 95a9b7fc67facd66837c849137e30e137112e19d --- include/rocksdb/cache.h | 4 ++++ include/rocksdb/memory_allocator.h | 35 +++++++++++++++++++++++++++++- util/jemalloc_nodump_allocator.cc | 26 +++++++++++++++++----- util/jemalloc_nodump_allocator.h | 8 +++++-- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 3ceda0d01..190112b37 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -62,6 +62,10 @@ struct LRUCacheOptions { // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using // the cache! + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). std::shared_ptr memory_allocator; LRUCacheOptions() {} diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 15aab65fc..889c0e921 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -36,9 +36,42 @@ class MemoryAllocator { } }; -// Generate cache allocators which allocates through Jemalloc and utilize +struct JemallocAllocatorOptions { + // Jemalloc tcache caches allocations by size class. For each size class, + // it caches between 20 (for large size classes) and 200 (for small size + // classes). 
To reduce tcache memory usage in case the allocator is accessed + // by a large number of threads, we can control whether to cache an allocation + // by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size/4. + size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generate memory allocators which allocate through Jemalloc and utilize // MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dump. +// +// Implementation details: +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. A side benefit of using a single arena is reduced jemalloc +// metadata for some workloads. +// +// To mitigate mutex contention from using one single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. 
extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); } // namespace rocksdb diff --git a/util/jemalloc_nodump_allocator.cc b/util/jemalloc_nodump_allocator.cc index 5b8b735bc..a783d33d5 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/util/jemalloc_nodump_allocator.cc @@ -19,15 +19,21 @@ namespace rocksdb { std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; JemallocNodumpAllocator::JemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::unique_ptr&& arena_hooks, unsigned arena_index) - : arena_hooks_(std::move(arena_hooks)), + : options_(options), + arena_hooks_(std::move(arena_hooks)), arena_index_(arena_index), tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} -int JemallocNodumpAllocator::GetThreadSpecificCache() { +int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { // We always enable tcache. The only corner case is when there are a ton of // threads accessing with low frequency, then it could consume a lot of // memory (may reach # threads * ~1MB) without bringing too much benefit. + if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound || + size > options_.tcache_size_upper_bound)) { + return MALLOCX_TCACHE_NONE; + } unsigned* tcache_index = reinterpret_cast(tcache_.Get()); if (UNLIKELY(tcache_index == nullptr)) { // Instantiate tcache. @@ -46,13 +52,17 @@ int JemallocNodumpAllocator::GetThreadSpecificCache() { } void* JemallocNodumpAllocator::Allocate(size_t size) { - int tcache_flag = GetThreadSpecificCache(); + int tcache_flag = GetThreadSpecificCache(size); return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); } void JemallocNodumpAllocator::Deallocate(void* p) { // Obtain tcache. 
- int tcache_flag = GetThreadSpecificCache(); + size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); // No need to pass arena index to dallocx(). Jemalloc will find arena index // from its own metadata. dallocx(p, tcache_flag); @@ -120,6 +130,7 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { *memory_allocator = nullptr; #ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR @@ -130,6 +141,11 @@ Status NewJemallocNodumpAllocator( if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); } + if (options.limit_tcache_size && + options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } // Create arena. unsigned arena_index = 0; @@ -177,7 +193,7 @@ Status NewJemallocNodumpAllocator( // Create cache allocator. 
memory_allocator->reset( - new JemallocNodumpAllocator(std::move(new_hooks), arena_index)); + new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index)); return Status::OK(); #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR } diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h index 209cee234..914088de1 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/util/jemalloc_nodump_allocator.h @@ -25,7 +25,8 @@ namespace rocksdb { class JemallocNodumpAllocator : public MemoryAllocator { public: - JemallocNodumpAllocator(std::unique_ptr&& arena_hooks, + JemallocNodumpAllocator(JemallocAllocatorOptions& options, + std::unique_ptr&& arena_hooks, unsigned arena_index); ~JemallocNodumpAllocator(); @@ -36,6 +37,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { private: friend Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); // Custom alloc hook to replace jemalloc default alloc. @@ -51,7 +53,7 @@ class JemallocNodumpAllocator : public MemoryAllocator { // Get or create tcache. Return flag suitable to use with `mallocx`: // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). - int GetThreadSpecificCache(); + int GetThreadSpecificCache(size_t size); // A function pointer to jemalloc default alloc. Use atomic to make sure // NewJemallocNodumpAllocator is thread-safe. @@ -60,6 +62,8 @@ class JemallocNodumpAllocator : public MemoryAllocator { // alloc needs to be static to pass to jemalloc as function pointer. static std::atomic original_alloc_; + const JemallocAllocatorOptions options_; + // Custom hooks has to outlive corresponding arena. const std::unique_ptr arena_hooks_; -- GitLab