diff --git a/HISTORY.md b/HISTORY.md
index 63a9c209576e0eb2ef62fff928a1f2ce4982b60a..78086c33a4db3685ddfe45ab35b2411ac65c814a 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -16,6 +16,9 @@
 * Fix a bug in which backup/checkpoint can include a WAL deleted by RocksDB.
 * Fix a bug where concurrent compactions might cause unnecessary further write stalling. In some cases, this might cause write rate to drop to minimum.
 
+## Behavior Change
+* In leveled compaction with dynamic levelling, the level multiplier is no longer adjusted due to an oversized L0. Instead, the compaction score is adjusted by increasing a level's size target by the incoming bytes from upper levels. This deprioritizes compaction out of a level when a large amount of data from upper levels (e.g. an oversized L0) is about to be compacted into it. This fixes unnecessary full stalls caused by drastic changes of level targets, while still not wasting write bandwidth on compaction while writes are overloaded.
+
 ## 7.4.0 (06/19/2022)
 ### Bug Fixes
 * Fixed a bug in calculating key-value integrity protection for users of in-place memtable updates. In particular, the affected users would be those who configure `protection_bytes_per_key > 0` on `WriteBatch` or `WriteOptions`, and configure `inplace_callback != nullptr`.
diff --git a/db/version_set.cc b/db/version_set.cc
index c6098723d59fe12a5b738da965fe007b8eed5efd..f07f12841fa600b6f72ab9aba19f30a4627abd4f 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2657,6 +2657,16 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
 void VersionStorageInfo::ComputeCompactionScore(
     const ImmutableOptions& immutable_options,
     const MutableCFOptions& mutable_cf_options) {
+  double total_downcompact_bytes = 0.0;
+  // Historically, score is defined as actual bytes in a level divided by
+  // the level's target size, and 1.0 is the threshold for triggering
+  // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for prioritization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing the score while still
+  // keeping it over 1.0, we scale the original score by 10x
+  // if it is larger than 1.0.
+  const double kScoreScale = 10.0;
   for (int level = 0; level <= MaxInputLevel(); level++) {
     double score;
     if (level == 0) {
@@ -2674,6 +2684,7 @@ void VersionStorageInfo::ComputeCompactionScore(
       int num_sorted_runs = 0;
       uint64_t total_size = 0;
       for (auto* f : files_[level]) {
+        total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
         if (!f->being_compacted) {
           total_size += f->compensated_file_size;
           num_sorted_runs++;
@@ -2737,18 +2748,40 @@ void VersionStorageInfo::ComputeCompactionScore(
          }
          score = std::max(
              score, static_cast<double>(total_size) / l0_target_size);
+         if (immutable_options.level_compaction_dynamic_level_bytes &&
+             score > 1.0) {
+           score *= kScoreScale;
+         }
        }
      }
    } else {
      // Compute the ratio of current size to size limit.
      uint64_t level_bytes_no_compacting = 0;
+     uint64_t level_total_bytes = 0;
      for (auto f : files_[level]) {
+       level_total_bytes += f->fd.GetFileSize();
        if (!f->being_compacted) {
          level_bytes_no_compacting += f->compensated_file_size;
        }
      }
-     score = static_cast<double>(level_bytes_no_compacting) /
-             MaxBytesForLevel(level);
+     if (!immutable_options.level_compaction_dynamic_level_bytes ||
+         level_bytes_no_compacting < MaxBytesForLevel(level)) {
+       score = static_cast<double>(level_bytes_no_compacting) /
+               MaxBytesForLevel(level);
+     } else {
+       // If a large amount of data is going to be compacted down to the
+       // current level soon, we de-prioritize compaction out of levels
+       // where the incoming data is a large fraction of the target. We
+       // do it by dividing the level size not by the target level size,
+       // but by the target size plus the incoming compaction bytes.
+       score = static_cast<double>(level_bytes_no_compacting) /
+               (MaxBytesForLevel(level) + total_downcompact_bytes) *
+               kScoreScale;
+     }
+     if (level_total_bytes > MaxBytesForLevel(level)) {
+       total_downcompact_bytes +=
+           static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+     }
    }
    compaction_level_[level] = level;
    compaction_score_[level] = score;
@@ -3775,13 +3808,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
     // No compaction from L1+ needs to be scheduled.
     base_level_ = num_levels_ - 1;
   } else {
-    uint64_t l0_size = 0;
-    for (const auto& f : files_[0]) {
-      l0_size += f->fd.GetFileSize();
-    }
-
-    uint64_t base_bytes_max =
-        std::max(options.max_bytes_for_level_base, l0_size);
+    uint64_t base_bytes_max = options.max_bytes_for_level_base;
     uint64_t base_bytes_min = static_cast<uint64_t>(
         base_bytes_max / options.max_bytes_for_level_multiplier);
@@ -3823,26 +3850,6 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
       level_multiplier_ = options.max_bytes_for_level_multiplier;
       assert(base_level_size > 0);
-      if (l0_size > base_level_size &&
-          (l0_size > options.max_bytes_for_level_base ||
-           static_cast<int>(files_[0].size() / 2) >=
-               options.level0_file_num_compaction_trigger)) {
-        // We adjust the base level according to actual L0 size, and adjust
-        // the level multiplier accordingly, when:
-        // 1. the L0 size is larger than level size base, or
-        // 2. number of L0 files reaches twice the L0->L1 compaction trigger
-        // We don't do this otherwise to keep the LSM-tree structure stable
-        // unless the L0 compaction is backlogged.
-        base_level_size = l0_size;
-        if (base_level_ == num_levels_ - 1) {
-          level_multiplier_ = 1.0;
-        } else {
-          level_multiplier_ = std::pow(
-              static_cast<double>(max_level_size) /
-                  static_cast<double>(base_level_size),
-              1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
-        }
-      }
 
       uint64_t level_size = base_level_size;
       for (int i = base_level_; i < num_levels_; i++) {
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index c4c125bfcc66eb0e2180db0b4a7ec2b6864d290f..47bfe5ee89ab0883efc680f53865a5a9bc7f9850 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -376,73 +376,80 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
   ASSERT_EQ(2, vstorage_.base_level());
   // level multiplier should be 3.5
   ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
-  // Level size should be around 30,000, 105,000, 367,500
   ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
   ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
   ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Only L0 hits compaction.
+  ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
 }
 
 TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
   ioptions_.level_compaction_dynamic_level_bytes = true;
   mutable_cf_options_.max_bytes_for_level_base = 10000;
   mutable_cf_options_.max_bytes_for_level_multiplier = 5;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
 
   Add(0, 11U, "1", "2", 10000U);
   Add(0, 12U, "1", "2", 10000U);
   Add(0, 13U, "1", "2", 10000U);
 
+  // Level size should be around 10,000, 10,290, 51,450, 257,250
   Add(5, 4U, "1", "2", 1286250U);
-  Add(4, 5U, "1", "2", 200000U);
-  Add(3, 6U, "1", "2", 40000U);
-  Add(2, 7U, "1", "2", 8000U);
+  Add(4, 5U, "1", "2", 258000U);  // unadjusted score 1.003
+  Add(3, 6U, "1", "2", 53000U);   // unadjusted score 1.03
+  Add(2, 7U, "1", "2", 20000U);   // unadjusted score 1.94
 
   UpdateVersionStorageInfo();
 
   ASSERT_EQ(0, logger_->log_count);
-  ASSERT_EQ(2, vstorage_.base_level());
-  // level multiplier should be 3.5
-  ASSERT_LT(vstorage_.level_multiplier(), 3.6);
-  ASSERT_GT(vstorage_.level_multiplier(), 3.4);
-  // Level size should be around 30,000, 105,000, 367,500
-  ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
-  ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
-  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
-  ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
-  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+  ASSERT_EQ(1, vstorage_.base_level());
+  ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
+  ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
+  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 and L3 have higher unadjusted compaction scores, L4 is
+  // picked up for compaction because a relatively large L0 will be
+  // compacted down soon.
+  // L0 is still picked first because it is oversized.
+  ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
+  ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
 }
 
 TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
   ioptions_.level_compaction_dynamic_level_bytes = true;
-  mutable_cf_options_.max_bytes_for_level_base = 10000;
+  mutable_cf_options_.max_bytes_for_level_base = 20000;
   mutable_cf_options_.max_bytes_for_level_multiplier = 5;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 5;
 
-  Add(0, 11U, "1", "2", 5000U);
-  Add(0, 12U, "1", "2", 5000U);
-  Add(0, 13U, "1", "2", 5000U);
-  Add(0, 14U, "1", "2", 5000U);
-  Add(0, 15U, "1", "2", 5000U);
-  Add(0, 16U, "1", "2", 5000U);
+  Add(0, 11U, "1", "2", 2500U);
+  Add(0, 12U, "1", "2", 2500U);
+  Add(0, 13U, "1", "2", 2500U);
+  Add(0, 14U, "1", "2", 2500U);
 
+  // Level size should be around 20,000, 53,000, 258,000
   Add(5, 4U, "1", "2", 1286250U);
-  Add(4, 5U, "1", "2", 200000U);
-  Add(3, 6U, "1", "2", 40000U);
-  Add(2, 7U, "1", "2", 8000U);
+  Add(4, 5U, "1", "2", 260000U);  // Unadjusted score 1.01, adjusted about 4.3
+  Add(3, 6U, "1", "2", 85000U);   // Unadjusted score 1.42, adjusted about 11.6
+  Add(2, 7U, "1", "2", 30000U);   // Unadjusted score 1.5, adjusted about 10.0
 
   UpdateVersionStorageInfo();
 
   ASSERT_EQ(0, logger_->log_count);
   ASSERT_EQ(2, vstorage_.base_level());
-  // level multiplier should be 3.5
-  ASSERT_LT(vstorage_.level_multiplier(), 3.6);
-  ASSERT_GT(vstorage_.level_multiplier(), 3.4);
-  // Level size should be around 30,000, 105,000, 367,500
-  ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
-  ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
-  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
-  ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
-  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+  ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 has a higher unadjusted compaction score, L3 is picked up
+  // for compaction because a relatively large L0 will be compacted down
+  // soon.
+
+  ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
+  ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
+  ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
 }
 
 TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
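
To make the new L1+ scoring rule concrete, here is a minimal standalone sketch. It is not part of the patch and not RocksDB code: the helper name AdjustedScore and the byte counts in main() are invented for illustration, and the real implementation additionally handles files already being compacted, compensated file sizes, and the accumulation of total_downcompact_bytes across levels. Only the formula for an oversized level and the kScoreScale = 10.0 constant mirror the diff.

// Standalone sketch of the adjusted score for an L1+ level under
// level_compaction_dynamic_level_bytes, per this patch (illustrative only).
#include <cstdint>
#include <cstdio>

constexpr double kScoreScale = 10.0;

// level_bytes: current size of the level.
// target_bytes: the level's size target.
// incoming_bytes_from_above: bytes expected to be compacted down into it.
double AdjustedScore(uint64_t level_bytes, uint64_t target_bytes,
                     double incoming_bytes_from_above) {
  if (level_bytes < target_bytes) {
    // Not oversized: classic size / target ratio, stays below 1.0.
    return static_cast<double>(level_bytes) / target_bytes;
  }
  // Oversized: score against the target plus the data expected to arrive
  // soon, then scale by 10x so the result can stay above the 1.0 trigger.
  return static_cast<double>(level_bytes) /
         (static_cast<double>(target_bytes) + incoming_bytes_from_above) *
         kScoreScale;
}

int main() {
  // Hypothetical level: 30,000 bytes against a 20,000-byte target.
  std::printf("no incoming data:    %.2f\n", AdjustedScore(30000, 20000, 0.0));
  std::printf("40000 bytes coming:  %.2f\n",
              AdjustedScore(30000, 20000, 40000.0));
  return 0;
}

With nothing coming down from above, the oversized level scores 15.0; with 40,000 bytes expected to arrive from upper levels, its score drops to 5.0, so the level awaiting the inflow is deprioritized relative to other levels while still remaining above the 1.0 compaction trigger.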