From b1b6f87fbe27500f93388322eaaf57d41d7416d3 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Fri, 14 Jul 2023 16:19:22 -0700
Subject: [PATCH] Some small improvements to HyperClockCache (#11601)

Summary:
Stacked on https://github.com/facebook/rocksdb/issues/11572

* Minimize use of std::function and lambdas to minimize the chance of the
  compiler heap-allocating closures (unnecessary stress on the allocator). It
  appears that converting FindSlot to a template enables inlining the lambda
  parameters, avoiding heap allocations.
* Clean up some logic with FindSlot (FIXMEs from
  https://github.com/facebook/rocksdb/issues/11572)
* Fix handling of the rare case of probing all slots, with a new unit test.
  (Previously, Insert would not roll back displacements in that case, which
  would kill performance if it were to happen.)
* Add an -early_exit option to cache_bench for gathering memory stats before
  deallocation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601

Test Plan: unit test added for probing all slots

## Seeing heap allocations
Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache`
before https://github.com/facebook/rocksdb/issues/11572 vs. after this change.
Before, we see this in the interesting bin statistics:

```
size  nrequests
----  ---------
  32     578460
  64      24340
8192     578460
```

And after:

```
size  nrequests
----  ---------
  32  (insignificant)
  64      24370
8192     579130
```

## Performance test
Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench`

Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000` in
before and after configurations, simultaneously:

```
Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442
After: Complete in 32.773 s; Rough parallel ops/sec = 2441019
```

Reviewed By: jowlyzhang

Differential Revision: D47375092

Pulled By: pdillinger

fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410
---
 cache/cache_bench_tool.cc                          |   8 ++
 cache/clock_cache.cc                               | 102 +++++++++---------
 cache/clock_cache.h                                |  40 ++++---
 cache/lru_cache_test.cc                            |  56 ++++++++++
 .../performance_improvements/hcc_perf              |   1 +
 5 files changed, 140 insertions(+), 67 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/hcc_perf

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 1d93c1d96..61b12b19e 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
             "If true, no additional computation is performed besides cache "
             "operations.");
 
+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g."
+            "MALLOC_CONF=\"stats_print:true\"");
+
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
       }
       thread->latency_ns_hist.Add(timer.ElapsedNanos());
     }
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
+    }
     if (handle) {
       cache_->Release(handle);
       handle = nullptr;
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 6b1caabfa..4d4faeb30 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                             uint64_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
-
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
   uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // Do we have the available occupancy? Optimistically assume we do
   // and deal with it if we don't.
   size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
-  auto revert_occupancy_fn = [&]() {
-    occupancy_.fetch_sub(1, std::memory_order_relaxed);
-  };
   // Whether we over-committed and need an eviction to make up for it
   bool need_evict_for_occupancy =
       !derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     Status s = ChargeUsageMaybeEvictStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!s.ok()) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       return s;
     }
   } else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
         total_charge, capacity, need_evict_for_occupancy, state);
     if (!success) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
       if (handle == nullptr) {
         // Don't insert the entry but still return ok, as if the entry
         // inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       }
     }
   }
-  auto revert_usage_fn = [&]() {
-    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
-    // No underflow
-    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
-  };
 
   if (!use_standalone_insert) {
     // Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
       return Status::OK();
     }
     // Not inserted
-    revert_occupancy_fn();
+    // Revert occupancy
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
     // Maybe fall back on standalone insert
     if (handle == nullptr) {
-      revert_usage_fn();
+      // Revert usage
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      // No underflow
+      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
       // As if unrefed entry immdiately evicted
       proto.FreeData(allocator_);
       return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
     const ClockHandleBasicData& proto, uint64_t initial_countdown,
     bool keep_ref, InsertState&) {
-  size_t probe = 0;
   bool already_matches = false;
   HandleImpl* e = FindSlot(
       proto.hashed_key,
       [&](HandleImpl* h) {
-        // FIXME: simplify and handle in abort_fn below?
-        bool inserted =
-            TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
-        return inserted || already_matches;
+        return TryInsert(proto, *h, initial_countdown, keep_ref,
+                         &already_matches);
       },
-      [&](HandleImpl* /*h*/) { return false; },
       [&](HandleImpl* h) {
-        h->displacements.fetch_add(1, std::memory_order_relaxed);
+        if (already_matches) {
+          // Stop searching & roll back displacements
+          Rollback(proto.hashed_key, h);
+          return true;
+        } else {
+          // Keep going
+          return false;
+        }
       },
-      probe);
-  if (e == nullptr) {
-    // Occupancy check and never abort FindSlot above should generally
-    // prevent this, except it's theoretically possible for other threads
-    // to evict and replace entries in the right order to hit every slot
-    // when it is populated. Assuming random hashing, the chance of that
-    // should be no higher than pow(kStrictLoadFactor, n) for n slots.
-    // That should be infeasible for roughly n >= 256, so if this assertion
-    // fails, that suggests something is going wrong.
-    assert(GetTableSize() < 256);
-    // WART/FIXME: need to roll back every slot
-    already_matches = true;
+      [&](HandleImpl* h, bool is_last) {
+        if (is_last) {
+          // Search is ending. Roll back displacements
+          Rollback(proto.hashed_key, h);
+        } else {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        }
+      });
+  if (already_matches) {
+    // Insertion skipped
+    return nullptr;
   }
-  if (!already_matches) {
+  if (e != nullptr) {
     // Successfully inserted
-    assert(e);
     return e;
   }
-  // Roll back displacements from failed table insertion
-  Rollback(proto.hashed_key, e);
-  // Insertion skipped
+  // Else, no available slot found. Occupancy check should generally prevent
+  // this, except it's theoretically possible for other threads to evict and
+  // replace entries in the right order to hit every slot when it is populated.
+  // Assuming random hashing, the chance of that should be no higher than
+  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
+  // roughly n >= 256, so if this assertion fails, that suggests something is
+  // going wrong.
+  assert(GetTableSize() < 256);
   return nullptr;
 }
 
 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
     const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
   return e;
 }
 
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
 #endif
 
 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
   (void)FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         return h->displacements.load(std::memory_order_relaxed) == 0;
       },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 }
 
 void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
   }
 }
 
+template <typename MatchFn, typename AbortFn, typename UpdateFn>
 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
-    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
-    std::function<bool(HandleImpl*)> abort_fn,
-    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+    const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
+    UpdateFn update_fn) {
   // NOTE: upper 32 bits of hashed_key[0] is used for sharding
   //
   // We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
   // TODO: we could also reconsider linear probing, though locality benefits
   // are limited because each slot is a full cache line
   size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
-  size_t current = ModTableSize(base + probe * increment);
-  while (probe <= length_bits_mask_) {
+  size_t first = ModTableSize(base);
+  size_t current = first;
+  bool is_last;
+  do {
     HandleImpl* h = &array_[current];
     if (match_fn(h)) {
-      probe++;
       return h;
     }
     if (abort_fn(h)) {
       return nullptr;
     }
-    probe++;
-    update_fn(h);
     current = ModTableSize(current + increment);
-  }
+    is_last = current == first;
+    update_fn(h, is_last);
+  } while (!is_last);
   // We looped back.
   return nullptr;
 }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index fff3ef43d..992083e4f 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
   size_t GetOccupancyLimit() const { return occupancy_limit_; }
 
 #ifndef NDEBUG
-  void TEST_ReleaseN(HandleImpl* h, size_t n);
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
+  }
+
+  // Release N references
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
 
  private:  // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
     return static_cast<size_t>(x) & length_bits_mask_;
   }
 
-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
-  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <typename MatchFn, typename AbortFn, typename UpdateFn>
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
+                              AbortFn abort_fn, UpdateFn update_fn);
 
   // Re-decrement all displacements in probe path starting from beginning
   // until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return Lookup(key, hashed_key);
   }
 
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
   // Acquire/release N references
   void TEST_RefN(HandleImpl* handle, size_t n);
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif
 
  private:  // data
   Table table_;
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 6e5118fb0..cb7beb7b1 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   ASSERT_EQ(val.deleted, 1);
 }
 
+TEST_F(ClockCacheTest, ClockTableFull) {
+  // Force clock cache table to fill up (not usually allowed) in order
+  // to test full probe sequence that is theoretically possible due to
+  // parallel operations
+  NewShard(6, /*strict_capacity_limit*/ false);
+  size_t size = shard_->GetTableAddressCount();
+  ASSERT_LE(size + 3, 256);  // for using char keys
+  // Modify occupancy and capacity limits to attempt insert on full
+  shard_->TEST_MutableOccupancyLimit() = size + 100;
+  shard_->SetCapacity(size + 100);
+
+  DeleteCounter val;
+  std::vector<HandleImpl*> handles;
+  // NOTE: the three extra insertions should create standalone entries
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
+                             1, &handles.emplace_back(),
+                             Cache::Priority::HIGH));
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
+    if (i < size) {
+      ASSERT_NE(h, nullptr);
+      shard_->Release(h);
+    } else {
+      // Standalone entries not visible by lookup
+      ASSERT_EQ(h, nullptr);
+    }
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    ASSERT_NE(handles[i], nullptr);
+    shard_->Release(handles[i]);
+    if (i < size) {
+      // Everything still in cache
+      ASSERT_EQ(val.deleted, 0);
+    } else {
+      // Standalone entries freed on release
+      ASSERT_EQ(val.deleted, i + 1 - size);
+    }
+  }
+
+  for (size_t i = size + 3; i > 0; --i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
+    shard_->Erase(TestKey(hkey), hkey);
+    if (i - 1 > size) {
+      ASSERT_EQ(val.deleted, 3);
+    } else {
+      ASSERT_EQ(val.deleted, 3 + size - (i - 1));
+    }
+  }
+}
+
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
diff --git a/unreleased_history/performance_improvements/hcc_perf b/unreleased_history/performance_improvements/hcc_perf
new file mode 100644
index 000000000..c129393dc
--- /dev/null
+++ b/unreleased_history/performance_improvements/hcc_perf
@@ -0,0 +1 @@
+Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
--
GitLab
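
Aside (illustration only, not part of the patch and not RocksDB code): the heap allocations that the FindSlot templating removes come from std::function copying capturing lambdas whose captured state exceeds its small-buffer optimization. The sketch below makes that observable; `find_slot_fn` and `find_slot_tmpl` are hypothetical stand-ins for a probe loop like FindSlot, and the exact allocation counts depend on the standard library implementation.

```cpp
// Illustrative sketch only (not RocksDB code): passing a capturing lambda
// through std::function can heap-allocate a copy of the closure, while a
// template parameter preserves the exact closure type (and typically inlines).
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <new>

static std::size_t g_heap_allocs = 0;

// Replace global operator new/delete so heap allocations are observable.
void* operator new(std::size_t n) {
  ++g_heap_allocs;
  if (void* p = std::malloc(n)) {
    return p;
  }
  throw std::bad_alloc{};
}
void operator delete(void* p) noexcept { std::free(p); }
void operator delete(void* p, std::size_t) noexcept { std::free(p); }

// Hypothetical stand-in: probe loop taking the predicate as std::function.
int find_slot_fn(const int* table, int n,
                 const std::function<bool(int)>& match_fn) {
  for (int i = 0; i < n; ++i) {
    if (match_fn(table[i])) return i;
  }
  return -1;
}

// Hypothetical stand-in: same loop, predicate as a template parameter.
template <typename MatchFn>
int find_slot_tmpl(const int* table, int n, MatchFn match_fn) {
  for (int i = 0; i < n; ++i) {
    if (match_fn(table[i])) return i;
  }
  return -1;
}

int main() {
  int table[16] = {};
  table[7] = 42;
  // Enough captured state to exceed the typical small-buffer optimization in
  // std::function, forcing a heap-allocated copy of the closure.
  long a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7, h = 8;
  auto match = [=](int v) {
    return v == 42 && (a + b + c + d + e + f + g + h) > 0;
  };

  std::size_t before = g_heap_allocs;
  int idx1 = find_slot_fn(table, 16, match);
  std::printf("std::function: found %d, %zu heap allocation(s)\n", idx1,
              g_heap_allocs - before);

  before = g_heap_allocs;
  int idx2 = find_slot_tmpl(table, 16, match);
  std::printf("template:      found %d, %zu heap allocation(s)\n", idx2,
              g_heap_allocs - before);
  return 0;
}
```

On common implementations (libstdc++, libc++), the std::function path typically reports one allocation per call while the template path reports none, which is consistent with the small-bin (32-byte) malloc requests disappearing in the stats shown in the summary above.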