Some small improvements to HyperClockCache (#11601)

Summary: Stacked on https://github.com/facebook/rocksdb/issues/11572 * Minimize use of std::function and lambdas to minimize chances of compiler heap-allocating closures (unnecessary stress on allocator). It appears that converting FindSlot to a template enables inlining the lambda parameters, avoiding heap allocations. * Clean up some logic with FindSlot (FIXMEs from https://github.com/facebook/rocksdb/issues/11572) * Fix handling of rare case of probing all slots, with new unit test. (Previously Insert would not roll back displacements in that case, which would kill performance if it were to happen.) * Add an -early_exit option to cache_bench for gathering memory stats before deallocation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601 Test Plan: unit test added for probing all slots ## Seeing heap allocations Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache` before https://github.com/facebook/rocksdb/issues/11572 vs. after this change. Before, we see this in the interesting bin statistics: ``` size nrequests ---- --------- 32 578460 64 24340 8192 578460 ``` And after: ``` size nrequests ---- --------- 32 (insignificant) 64 24370 8192 579130 ``` ## Performance test Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench` Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000` in before and after configurations, simultaneously: ``` Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442 After: Complete in 32.773 s; Rough parallel ops/sec = 2441019 ``` Reviewed By: jowlyzhang Differential Revision: D47375092 Pulled By: pdillinger fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410

Some small improvements to HyperClockCache (#11601)
Summary: Stacked on https://github.com/facebook/rocksdb/issues/11572 * Minimize use of std::function and lambdas to minimize chances of compiler heap-allocating closures (unnecessary stress on allocator). It appears that converting FindSlot to a template enables inlining the lambda parameters, avoiding heap allocations. * Clean up some logic with FindSlot (FIXMEs from https://github.com/facebook/rocksdb/issues/11572) * Fix handling of rare case of probing all slots, with new unit test. (Previously Insert would not roll back displacements in that case, which would kill performance if it were to happen.) * Add an -early_exit option to cache_bench for gathering memory stats before deallocation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11601 Test Plan: unit test added for probing all slots ## Seeing heap allocations Run `MALLOC_CONF="stats_print:true" ./cache_bench -cache_type=hyper_clock_cache` before https://github.com/facebook/rocksdb/issues/11572 vs. after this change. Before, we see this in the interesting bin statistics: ``` size nrequests ---- --------- 32 578460 64 24340 8192 578460 ``` And after: ``` size nrequests ---- --------- 32 (insignificant) 64 24370 8192 579130 ``` ## Performance test Build with `make USE_CLANG=1 PORTABLE=0 DEBUG_LEVEL=0 -j32 cache_bench` Run `./cache_bench -cache_type=hyper_clock_cache -ops_per_thread=5000000` in before and after configurations, simultaneously: ``` Before: Complete in 33.244 s; Rough parallel ops/sec = 2406442 After: Complete in 32.773 s; Rough parallel ops/sec = 2441019 ``` Reviewed By: jowlyzhang Differential Revision: D47375092 Pulled By: pdillinger fbshipit-source-id: 46f0f57257ddb374290a0a38c651764ea60ba410
b1b6f87f · Peter Dillinger · Facebook GitHub Bot · bc0db334 · b1b6f87f · b1b6f87f
5 changed file
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -77,6 +77,10 @@ DEFINE_bool(lean, false,
            "If true, no additional computation is performed besides cache "
            "operations.");

+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g."
+            "MALLOC_CONF=\"stats_print:true\"");
+
 DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -593,6 +597,10 @@ class CacheBench {
      }
      thread->latency_ns_hist.Add(timer.ElapsedNanos());
    }
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
+    }
    if (handle) {
      cache_->Release(handle);
      handle = nullptr;

--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -195,7 +195,6 @@ inline void CorrectNearOverflow(uint64_t old_meta,
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
                            uint64_t initial_countdown, bool* already_matches) {
  assert(*already_matches == false);
-
  // Optimistically transition the slot from "empty" to
  // "under construction" (no effect on other states)
  uint64_t old_meta = h.meta.fetch_or(
@@ -486,9 +485,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
  // Do we have the available occupancy? Optimistically assume we do
  // and deal with it if we don't.
  size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
-  auto revert_occupancy_fn = [&]() {
-    occupancy_.fetch_sub(1, std::memory_order_relaxed);
-  };
  // Whether we over-committed and need an eviction to make up for it
  bool need_evict_for_occupancy =
      !derived.GrowIfNeeded(old_occupancy + 1, state);
@@ -501,7 +497,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
    Status s = ChargeUsageMaybeEvictStrict<Table>(
        total_charge, capacity, need_evict_for_occupancy, state);
    if (!s.ok()) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
      return s;
    }
  } else {
@@ -509,7 +506,8 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
    bool success = ChargeUsageMaybeEvictNonStrict<Table>(
        total_charge, capacity, need_evict_for_occupancy, state);
    if (!success) {
-      revert_occupancy_fn();
+      // Revert occupancy
+      occupancy_.fetch_sub(1, std::memory_order_relaxed);
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry
        // inserted into cache and evicted immediately.
@@ -522,11 +520,6 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
      }
    }
  }
-  auto revert_usage_fn = [&]() {
-    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
-    // No underflow
-    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
-  };

  if (!use_standalone_insert) {
    // Attempt a table insert, but abort if we find an existing entry for the
@@ -551,10 +544,14 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
      return Status::OK();
    }
    // Not inserted
-    revert_occupancy_fn();
+    // Revert occupancy
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
    // Maybe fall back on standalone insert
    if (handle == nullptr) {
-      revert_usage_fn();
+      // Revert usage
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      // No underflow
+      assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
      // As if unrefed entry immdiately evicted
      proto.FreeData(allocator_);
      return Status::OK();
@@ -680,47 +677,52 @@ bool HyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 HyperClockTable::HandleImpl* HyperClockTable::DoInsert(
    const ClockHandleBasicData& proto, uint64_t initial_countdown,
    bool keep_ref, InsertState&) {
-  size_t probe = 0;
  bool already_matches = false;
  HandleImpl* e = FindSlot(
      proto.hashed_key,
      [&](HandleImpl* h) {
-        // FIXME: simplify and handle in abort_fn below?
-        bool inserted =
-            TryInsert(proto, *h, initial_countdown, keep_ref, &already_matches);
-        return inserted || already_matches;
+        return TryInsert(proto, *h, initial_countdown, keep_ref,
+                         &already_matches);
      },
-      [&](HandleImpl* /*h*/) { return false; },
      [&](HandleImpl* h) {
-        h->displacements.fetch_add(1, std::memory_order_relaxed);
+        if (already_matches) {
+          // Stop searching & roll back displacements
+          Rollback(proto.hashed_key, h);
+          return true;
+        } else {
+          // Keep going
+          return false;
+        }
      },
-      probe);
-  if (e == nullptr) {
-    // Occupancy check and never abort FindSlot above should generally
-    // prevent this, except it's theoretically possible for other threads
-    // to evict and replace entries in the right order to hit every slot
-    // when it is populated. Assuming random hashing, the chance of that
-    // should be no higher than pow(kStrictLoadFactor, n) for n slots.
-    // That should be infeasible for roughly n >= 256, so if this assertion
-    // fails, that suggests something is going wrong.
-    assert(GetTableSize() < 256);
-    // WART/FIXME: need to roll back every slot
-    already_matches = true;
+      [&](HandleImpl* h, bool is_last) {
+        if (is_last) {
+          // Search is ending. Roll back displacements
+          Rollback(proto.hashed_key, h);
+        } else {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        }
+      });
+  if (already_matches) {
+    // Insertion skipped
+    return nullptr;
  }
-  if (!already_matches) {
+  if (e != nullptr) {
    // Successfully inserted
-    assert(e);
    return e;
  }
-  // Roll back displacements from failed table insertion
-  Rollback(proto.hashed_key, e);
-  // Insertion skipped
+  // Else, no available slot found. Occupancy check should generally prevent
+  // this, except it's theoretically possible for other threads to evict and
+  // replace entries in the right order to hit every slot when it is populated.
+  // Assuming random hashing, the chance of that should be no higher than
+  // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for
+  // roughly n >= 256, so if this assertion fails, that suggests something is
+  // going wrong.
+  assert(GetTableSize() < 256);
  return nullptr;
 }

 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
    const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
  HandleImpl* e = FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
@@ -780,7 +782,7 @@ HyperClockTable::HandleImpl* HyperClockTable::Lookup(
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});

  return e;
 }
@@ -873,7 +875,6 @@ void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
 #endif

 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
-  size_t probe = 0;
  (void)FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
@@ -940,7 +941,7 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
-      [&](HandleImpl* /*h*/) {}, probe);
+      [&](HandleImpl* /*h*/, bool /*is_last*/) {});
 }

 void HyperClockTable::ConstApplyToEntriesRange(
@@ -1005,10 +1006,10 @@ void HyperClockTable::EraseUnRefEntries() {
  }
 }

+template <typename MatchFn, typename AbortFn, typename UpdateFn>
 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
-    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
-    std::function<bool(HandleImpl*)> abort_fn,
-    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+    const UniqueId64x2& hashed_key, MatchFn match_fn, AbortFn abort_fn,
+    UpdateFn update_fn) {
  // NOTE: upper 32 bits of hashed_key[0] is used for sharding
  //
  // We use double-hashing probing. Every probe in the sequence is a
@@ -1022,20 +1023,21 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
  // TODO: we could also reconsider linear probing, though locality benefits
  // are limited because each slot is a full cache line
  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
-  size_t current = ModTableSize(base + probe * increment);
-  while (probe <= length_bits_mask_) {
+  size_t first = ModTableSize(base);
+  size_t current = first;
+  bool is_last;
+  do {
    HandleImpl* h = &array_[current];
    if (match_fn(h)) {
-      probe++;
      return h;
    }
    if (abort_fn(h)) {
      return nullptr;
    }
-    probe++;
-    update_fn(h);
    current = ModTableSize(current + increment);
-  }
+    is_last = current == first;
+    update_fn(h, is_last);
+  } while (!is_last);
  // We looped back.
  return nullptr;
 }

--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -549,7 +549,12 @@ class HyperClockTable : public BaseClockTable {
  size_t GetOccupancyLimit() const { return occupancy_limit_; }

 #ifndef NDEBUG
-  void TEST_ReleaseN(HandleImpl* h, size_t n);
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
+  }
+
+  // Release N references
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif

 private:  // functions
@@ -558,22 +563,18 @@ class HyperClockTable : public BaseClockTable {
    return static_cast<size_t>(x) & length_bits_mask_;
  }

-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
-  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <typename MatchFn, typename AbortFn, typename UpdateFn>
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, MatchFn match_fn,
+                              AbortFn abort_fn, UpdateFn update_fn);

  // Re-decrement all displacements in probe path starting from beginning
  // until (not including) the given handle
@@ -704,9 +705,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
    return Lookup(key, hashed_key);
  }

+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
  // Acquire/release N references
  void TEST_RefN(HandleImpl* handle, size_t n);
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif

 private:  // data
  Table table_;

--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -715,6 +715,62 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
  ASSERT_EQ(val.deleted, 1);
 }

+TEST_F(ClockCacheTest, ClockTableFull) {
+  // Force clock cache table to fill up (not usually allowed) in order
+  // to test full probe sequence that is theoretically possible due to
+  // parallel operations
+  NewShard(6, /*strict_capacity_limit*/ false);
+  size_t size = shard_->GetTableAddressCount();
+  ASSERT_LE(size + 3, 256);  // for using char keys
+  // Modify occupancy and capacity limits to attempt insert on full
+  shard_->TEST_MutableOccupancyLimit() = size + 100;
+  shard_->SetCapacity(size + 100);
+
+  DeleteCounter val;
+  std::vector<HandleImpl*> handles;
+  // NOTE: the three extra insertions should create standalone entries
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper,
+                             1, &handles.emplace_back(),
+                             Cache::Priority::HIGH));
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i));
+    HandleImpl* h = shard_->Lookup(TestKey(hkey), hkey);
+    if (i < size) {
+      ASSERT_NE(h, nullptr);
+      shard_->Release(h);
+    } else {
+      // Standalone entries not visible by lookup
+      ASSERT_EQ(h, nullptr);
+    }
+  }
+
+  for (size_t i = 0; i < size + 3; ++i) {
+    ASSERT_NE(handles[i], nullptr);
+    shard_->Release(handles[i]);
+    if (i < size) {
+      // Everything still in cache
+      ASSERT_EQ(val.deleted, 0);
+    } else {
+      // Standalone entries freed on release
+      ASSERT_EQ(val.deleted, i + 1 - size);
+    }
+  }
+
+  for (size_t i = size + 3; i > 0; --i) {
+    UniqueId64x2 hkey = TestHashedKey(static_cast<char>(i - 1));
+    shard_->Erase(TestKey(hkey), hkey);
+    if (i - 1 > size) {
+      ASSERT_EQ(val.deleted, 3);
+    } else {
+      ASSERT_EQ(val.deleted, 3 + size - (i - 1));
+    }
+  }
+}
+
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {

--- a/unreleased_history/performance_improvements/hcc_perf
+++ b/unreleased_history/performance_improvements/hcc_perf
+Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations