Avoid to go through every CF for every ReleaseSnapshot() (#5090)

Summary: With https://github.com/facebook/rocksdb/pull/3009 we go through every CF to check whether a bottommost compaction needs to be triggered. This is done while holding the DB mutex. What we do under the DB mutex may heavily influence the write throughput we can achieve, so we always want to minimize the work done there. Here we try to avoid this for-loop by first checking a global threshold. Most of the time, the CF loop can be avoided. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5090 Differential Revision: D14582684 Pulled By: siying fbshipit-source-id: 968f6d9bb6affe1a5ebc4910b418300b076f166f

Avoid to go through every CF for every ReleaseSnapshot() (#5090)
Summary: With https://github.com/facebook/rocksdb/pull/3009 we go through every CF to check whether a bottommost compaction needs to be triggered. This is done while holding the DB mutex. What we do under the DB mutex may heavily influence the write throughput we can achieve, so we always want to minimize the work done there. Here we try to avoid this for-loop by first checking a global threshold. Most of the time, the CF loop can be avoided. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5090 Differential Revision: D14582684 Pulled By: siying fbshipit-source-id: 968f6d9bb6affe1a5ebc4910b418300b076f166f
48e7effa · Siying Dong · Facebook Github Bot · 52e6404e · 48e7effa · 48e7effa
5 changed file
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -3348,7 +3348,7 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
  options.level0_file_num_compaction_trigger = kNumLevelFiles;
  // inflate it a bit to account for key/metadata overhead
  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
-  Reopen(options);
+  CreateAndReopenWithCF({"one"}, options);

  Random rnd(301);
  const Snapshot* snapshot = nullptr;
@@ -3379,10 +3379,12 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
  // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
  // files does not need to be preserved in case of a future snapshot.
  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
  // release snapshot and wait for compactions to finish. Single-file
  // compactions should be triggered, which reduce the size of each bottom-level
  // file without changing file count.
  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
        Compaction* compaction = reinterpret_cast<Compaction*>(arg);

--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -2113,6 +2113,18 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
  return snapshot;
 }

+namespace {
+typedef autovector<ColumnFamilyData*, 2> CfdList;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+  for (const ColumnFamilyData* t : list) {
+    if (t == cfd) {
+      return true;
+    }
+  }
+  return false;
+}
+}  //  namespace
+
 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
  {
@@ -2126,15 +2138,35 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
    } else {
      oldest_snapshot = snapshots_.oldest()->number_;
    }
-    for (auto* cfd : *versions_->GetColumnFamilySet()) {
-      cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
-      if (!cfd->current()
-               ->storage_info()
-               ->BottommostFilesMarkedForCompaction()
-               .empty()) {
-        SchedulePendingCompaction(cfd);
-        MaybeScheduleFlushOrCompaction();
+    // Avoid going through every column family by checking a global threshold
+    // first.
+    if (oldest_snapshot > bottommost_files_mark_threshold_) {
+      CfdList cf_scheduled;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+        if (!cfd->current()
+                 ->storage_info()
+                 ->BottommostFilesMarkedForCompaction()
+                 .empty()) {
+          SchedulePendingCompaction(cfd);
+          MaybeScheduleFlushOrCompaction();
+          cf_scheduled.push_back(cfd);
+        }
+      }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not do the same pass as the previous loop because
+      // mutex might be unlocked during the loop, making the result inaccurate.
+      SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        if (CfdListContains(cf_scheduled, cfd)) {
+          continue;
+        }
+        new_bottommost_files_mark_threshold = std::min(
+            new_bottommost_files_mark_threshold,
+            cfd->current()->storage_info()->bottommost_files_mark_threshold());
      }
+      bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
    }
  }
  delete casted_s;

--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -865,6 +865,7 @@ class DBImpl : public DB {
  friend class CompactedDBImpl;
  friend class DBTest_ConcurrentFlushWAL_Test;
  friend class DBTest_MixedSlowdownOptionsStop_Test;
+  friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
 #ifndef NDEBUG
  friend class DBTest2_ReadCallbackTest_Test;
  friend class WriteCallbackTest_WriteWithCallbackTest_Test;
@@ -1573,6 +1574,10 @@ class DBImpl : public DB {
  // Indicate DB was opened successfully
  bool opened_successfully_;

+  // The min threshold to trigger bottommost compaction for removing
+  // garbage, among all column families.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
  LogsWithPrepTracker logs_with_prep_tracker_;

  // Callback for compaction to check if a key is visible to a snapshot.

--- a/db/db_impl_compaction_flush.cc
+++ b/db/db_impl_compaction_flush.cc
@@ -2883,6 +2883,17 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
  }
  cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);

+  // There may be a small data race here. The snapshot triggering bottommost
+  // compaction may already be released here. But assuming there will always be
+  // newer snapshots created and released frequently, the compaction will be
+  // triggered soon anyway.
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+    bottommost_files_mark_threshold_ = std::min(
+        bottommost_files_mark_threshold_,
+        my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+  }
+
  // Whenever we install new SuperVersion, we might need to issue new flushes or
  // compactions.
  SchedulePendingCompaction(cfd);

--- a/db/version_set.h
+++ b/db/version_set.h
@@ -402,6 +402,10 @@ class VersionStorageInfo {

  bool force_consistency_checks() const { return force_consistency_checks_; }

+  SequenceNumber bottommost_files_mark_threshold() const {
+    return bottommost_files_mark_threshold_;
+  }
+
  // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
  // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
  //