From 8aa99fc71ed744eb56d9b24cfd1a2fab8fddc497 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Mon, 20 Jan 2020 21:30:22 -0800
Subject: [PATCH] Warn on excessive keys for legacy Bloom filter with 32-bit
 hash (#6317)

Summary:
With many millions of keys, the old Bloom filter implementation for the
block-based table (format_version <= 4) would have an excessive FP rate
due to the limitations of feeding the Bloom filter with a 32-bit hash.
This change computes an estimated inflated FP rate due to this effect
and warns in the log whenever an SST filter is constructed (almost
certainly a "full" not "partitioned" filter) whose estimated FP rate
exceeds 1.5x what it would be without this effect. The detailed
condition is only checked if 3 million keys or more have been added to
a filter, as this should be a lower bound for common bits/key settings
(< 20). Recommended remedies include smaller SST file size, using
format_version >= 5 (for the new Bloom filter), or using partitioned
filters.

This does not change behavior other than generating warnings for some
constructed filters using the old implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/6317

Test Plan:

Example with warning, 15M keys @ 15 bits/key (working_mem_size_mb is
just to stop after building one filter if it's large):

$ ./filter_bench -quick -impl=0 -working_mem_size_mb=1 -bits_per_key=15 -average_keys_per_filter=15000000 2>&1 | grep 'FP rate'
[WARN] [/block_based/filter_policy.cc:292] Using legacy SST/BBT Bloom filter with excessive key count (15.0M @ 15bpk), causing estimated 1.8x higher filter FP rate. Consider using new Bloom with format_version>=5, smaller SST file size, or partitioned filters.
Predicted FP rate %: 0.766702
Average FP rate %: 0.66846

Example without warning (150K keys):

$ ./filter_bench -quick -impl=0 -working_mem_size_mb=1 -bits_per_key=15 -average_keys_per_filter=150000 2>&1 | grep 'FP rate'
Predicted FP rate %: 0.422857
Average FP rate %: 0.379301
$

With more samples at 15 bits/key:
  150K keys -> no warning; actual: 0.379% FP rate (baseline)
  1M keys   -> no warning; actual: 0.396% FP rate, 1.045x
  9M keys   -> no warning; actual: 0.563% FP rate, 1.485x
  10M keys  -> warning (1.5x); actual: 0.564% FP rate, 1.488x
  15M keys  -> warning (1.8x); actual: 0.668% FP rate, 1.76x
  25M keys  -> warning (2.4x); actual: 0.880% FP rate, 2.32x

At 10 bits/key:
  150K keys -> no warning; actual: 1.17% FP rate (baseline)
  1M keys   -> no warning; actual: 1.16% FP rate
  10M keys  -> no warning; actual: 1.32% FP rate, 1.13x
  25M keys  -> no warning; actual: 1.63% FP rate, 1.39x
  35M keys  -> warning (1.6x); actual: 1.81% FP rate, 1.55x

At 5 bits/key:
  150K keys -> no warning; actual: 9.32% FP rate (baseline)
  25M keys  -> no warning; actual: 9.62% FP rate, 1.03x
  200M keys -> no warning; actual: 12.2% FP rate, 1.31x
  250M keys -> warning (1.5x); actual: 12.8% FP rate, 1.37x
  300M keys -> warning (1.6x); actual: 13.4% FP rate, 1.43x

The reason for the modest inaccuracy at low bits/key is that the assumed
independence between a collision among the 32-bit hash values feeding
the filter and an FP in the filter does not quite hold for
implementations using "simple" logic to compute indices from the stock
hash result. There's math on this in my dissertation, but I don't think
it's worth the effort just for these extreme cases (> 100 million keys
and low-ish bits/key).
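For intuition, here is a minimal standalone sketch of the warning math,
transcribing the formulas this change adds as BloomMath and
LegacyBloomImpl::EstimatedFpRate in util/bloom_impl.h. The
LegacyEstimatedFpRate helper name is illustrative, and num_probes = 10
assumes the legacy choice of roughly 0.69 * bits/key; the real builder
rounds filter size to whole cache lines, so exact values differ
slightly from the filter_bench output above:

// sketch.cc: reproduces ~0.77% / ~0.42% / ~1.8x for 15M keys @ 15 bits/key
#include <cmath>
#include <cstddef>
#include <cstdio>

static double StandardFpRate(double bits_per_key, int num_probes) {
  // Standard Bloom filter FP formula: (1 - e^(-k/b))^k
  return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
}

static double CacheLocalFpRate(double bits_per_key, int num_probes,
                               int cache_line_bits) {
  // Average the FP rates one standard deviation above and below the
  // mean cache-line occupancy.
  double keys_per_cache_line = cache_line_bits / bits_per_key;
  double keys_stddev = std::sqrt(keys_per_cache_line);
  double crowded_fp = StandardFpRate(
      cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
  double uncrowded_fp = StandardFpRate(
      cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
  return (crowded_fp + uncrowded_fp) / 2;
}

static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
  // Chance a new query collides with one of num_keys fingerprints.
  double base = num_keys * std::pow(0.5, fingerprint_bits);
  return base > 0.0001 ? 1.0 - std::exp(-base)
                       : base - (base * base * 0.5);
}

// Mirrors LegacyBloomImpl::EstimatedFpRate (ExtraRotates == false).
static double LegacyEstimatedFpRate(size_t keys, size_t bytes,
                                    int num_probes) {
  double bits_per_key = 8.0 * bytes / keys;
  double filter_rate = CacheLocalFpRate(bits_per_key, num_probes, 512);
  filter_rate += 0.1 / (bits_per_key * 0.75 + 22);  // index-computation flaw
  double fingerprint_rate = FingerprintFpRate(keys, 32);  // 32-bit hash in
  // Independent(-ish) probability sum, avoiding values very close to 1.
  return filter_rate + fingerprint_rate - filter_rate * fingerprint_rate;
}

int main() {
  const int bits_per_key = 15;
  const int num_probes = 10;  // legacy picks ~0.69 * bits/key
  const size_t keys = 15000000;
  double est = LegacyEstimatedFpRate(keys, keys * bits_per_key / 8,
                                     num_probes);
  double vs = LegacyEstimatedFpRate(1U << 16, (1U << 16) * bits_per_key / 8,
                                    num_probes);
  // Prints roughly: est FP 0.767%  baseline 0.421%  ratio 1.82x
  // A ratio >= 1.5x is what triggers the new warning.
  std::printf("est FP %.3f%%  baseline %.3f%%  ratio %.2fx\n",
              est * 100, vs * 100, est / vs);
  return 0;
}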
Differential Revision: D19471715

Pulled By: pdillinger

fbshipit-source-id: f80c96893a09bf1152630ff0b964e5cdd7e35c68
---
 table/block_based/filter_policy.cc         | 51 ++++++++++--
 table/block_based/filter_policy_internal.h |  5 ++
 util/bloom_impl.h                          | 95 ++++++++++++++++++++++
 util/filter_bench.cc                       | 25 +++++-
 4 files changed, 167 insertions(+), 9 deletions(-)

diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 1b49d554d..9890a1d8f 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -92,6 +92,11 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder {
     return num_cache_lines * 64 + /*metadata*/ 5;
   }
 
+  double EstimatedFpRate(size_t keys, size_t bytes) override {
+    return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5,
+                                               num_probes_, /*hash bits*/ 64);
+  }
+
  private:
   void AddAllEntries(char* data, uint32_t len) {
     // Simple version without prefetching:
@@ -194,7 +199,7 @@ using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
 
 class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
  public:
-  explicit LegacyBloomBitsBuilder(const int bits_per_key);
+  explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
 
   // No Copy allowed
   LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
@@ -214,10 +219,16 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
     return CalculateSpace(num_entry, &dont_care1, &dont_care2);
   }
 
+  double EstimatedFpRate(size_t keys, size_t bytes) override {
+    return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5,
+                                            num_probes_);
+  }
+
  private:
   int bits_per_key_;
   int num_probes_;
   std::vector<uint32_t> hash_entries_;
+  Logger* info_log_;
 
   // Get totalbits that optimized for cpu cache line
   uint32_t GetTotalBitsForLocality(uint32_t total_bits);
@@ -234,9 +245,11 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
   void AddHash(uint32_t h, char* data, uint32_t num_lines,
                uint32_t total_bits);
 };
 
-LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key)
+LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key,
+                                               Logger* info_log)
     : bits_per_key_(bits_per_key),
-      num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)) {
+      num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)),
+      info_log_(info_log) {
   assert(bits_per_key_);
 }
 
@@ -251,14 +264,39 @@ void LegacyBloomBitsBuilder::AddKey(const Slice& key) {
 
 Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
   uint32_t total_bits, num_lines;
-  char* data = ReserveSpace(static_cast<int>(hash_entries_.size()), &total_bits,
-                            &num_lines);
+  size_t num_entries = hash_entries_.size();
+  char* data =
+      ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines);
   assert(data);
 
   if (total_bits != 0 && num_lines != 0) {
     for (auto h : hash_entries_) {
       AddHash(h, data, num_lines, total_bits);
     }
+
+    // Check for excessive entries for 32-bit hash function
+    if (num_entries >= /* minimum of 3 million */ 3000000U) {
+      // More specifically, we can detect that the 32-bit hash function
+      // is causing significant increase in FP rate by comparing current
+      // estimated FP rate to what we would get with a normal number of
+      // keys at same memory ratio.
+      double est_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+          num_entries, total_bits / 8, num_probes_);
+      double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+          1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_);
+
+      if (est_fp_rate >= 1.50 * vs_fp_rate) {
+        // For more details, see
+        // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+        ROCKS_LOG_WARN(
+            info_log_,
+            "Using legacy SST/BBT Bloom filter with excessive key count "
+            "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. "
+            "Consider using new Bloom with format_version>=5, smaller SST "
+            "file size, or partitioned filters.",
+            num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate);
+      }
+    }
   }
   // See BloomFilterPolicy::GetFilterBitsReader for metadata
   data[total_bits / 8] = static_cast<char>(num_probes_);
@@ -545,7 +583,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
             "with format_version>=5.",
             whole_bits_per_key_, adjective);
       }
-      return new LegacyBloomBitsBuilder(whole_bits_per_key_);
+      return new LegacyBloomBitsBuilder(whole_bits_per_key_,
+                                        context.info_log);
     }
   }
   assert(false);
diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h
index 6fe344c48..df182b524 100644
--- a/table/block_based/filter_policy_internal.h
+++ b/table/block_based/filter_policy_internal.h
@@ -28,6 +28,11 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
   // metadata. Passing the result to CalculateNumEntry should
   // return >= the num_entry passed in.
   virtual uint32_t CalculateSpace(const int num_entry) = 0;
+
+  // Returns an estimate of the FP rate of the returned filter if
+  // `keys` keys are added and the filter returned by Finish is `bytes`
+  // bytes.
+  virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0;
 };
 
 // RocksDB built-in filter policy for Bloom or Bloom-like filters.
diff --git a/util/bloom_impl.h b/util/bloom_impl.h
index 73575b07c..2a9fbaef2 100644
--- a/util/bloom_impl.h
+++ b/util/bloom_impl.h
@@ -10,6 +10,7 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+#include <cmath>
 
 #include "rocksdb/slice.h"
 #include "util/hash.h"
@@ -20,6 +21,70 @@
 
 namespace rocksdb {
 
+class BloomMath {
+ public:
+  // False positive rate of a standard Bloom filter, for given ratio of
+  // filter memory bits to added keys, and number of probes per operation.
+  // (The false positive rate is effectively independent of scale, assuming
+  // the implementation scales OK.)
+  static double StandardFpRate(double bits_per_key, int num_probes) {
+    // Standard very-good-estimate formula. See
+    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
+  }
+
+  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
+  // for given ratio of filter memory bits to added keys, number of probes per
+  // operation (all within the given block or cache line size), and block or
+  // cache line size.
+  static double CacheLocalFpRate(double bits_per_key, int num_probes,
+                                 int cache_line_bits) {
+    double keys_per_cache_line = cache_line_bits / bits_per_key;
+    // A reasonable estimate is the average of the FP rates for one standard
+    // deviation above and below the mean bucket occupancy. See
+    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
+    double keys_stddev = std::sqrt(keys_per_cache_line);
+    double crowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
+    double uncrowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
+    return (crowded_fp + uncrowded_fp) / 2;
+  }
+
+  // False positive rate of querying a new item against `num_keys` items, all
+  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
+  // themselves are stored losslessly. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
+    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
+    // Base estimate assumes each key maps to a unique fingerprint.
+    // Could be > 1 in extreme cases.
+    double base_estimate = num_keys * inv_fingerprint_space;
+    // To account for potential overlap, we choose between two formulas.
+    if (base_estimate > 0.0001) {
+      // A very good formula assuming we don't construct a floating point
+      // number extremely close to 1. Always produces a probability < 1.
+      return 1.0 - std::exp(-base_estimate);
+    } else {
+      // A very good formula when base_estimate is far below 1. (Subtract
+      // away the integral-approximated sum that some key has same hash as
+      // one coming before it in a list.)
+      return base_estimate - (base_estimate * base_estimate * 0.5);
+    }
+  }
+
+  // Returns the probability of either of two independent(-ish) events
+  // happening, given their probabilities. (This is useful for combining
+  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
+  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double IndependentProbabilitySum(double rate1, double rate2) {
+    // Use formula that avoids floating point extremely close to 1 if
+    // rates are extremely small.
+    return rate1 + rate2 - (rate1 * rate2);
+  }
+};
+
 // A fast, flexible, and accurate cache-local Bloom implementation with
 // SIMD-optimized query performance (currently using AVX2 on Intel). Write
 // performance and non-SIMD read are very good, benefiting from fastrange32
@@ -72,6 +137,16 @@ namespace rocksdb {
 //
 class FastLocalBloomImpl {
  public:
+  // NOTE: this has only been validated to enough accuracy for producing
+  // reasonable warnings / user feedback, not for making functional decisions.
+  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
+                                int hash_bits) {
+    return BloomMath::IndependentProbabilitySum(
+        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
+                                    /*cache line bits*/ 512),
+        BloomMath::FingerprintFpRate(keys, hash_bits));
+  }
+
   static inline int ChooseNumProbes(int millibits_per_key) {
     // Since this implementation can (with AVX2) make up to 8 probes
     // for the same cost, we pick the most accurate num_probes, based
@@ -328,6 +403,26 @@ class LegacyLocalityBloomImpl {
   }
 
  public:
+  // NOTE: this has only been validated to enough accuracy for producing
+  // reasonable warnings / user feedback, not for making functional decisions.
+  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
+    double bits_per_key = 8.0 * bytes / keys;
+    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
+                                                     /*cache line bits*/ 512);
+    if (!ExtraRotates) {
+      // Good estimate of impact of flaw in index computation.
+      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
+      // The + 22 shifts it nicely to fit for lower bits/key.
+      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
+    } else {
+      // Not yet validated
+      assert(false);
+    }
+    // Always uses 32-bit hash
+    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
+    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
+  }
+
   static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
                              char *data, int log2_cache_line_bytes) {
     const int log2_cache_line_bits = log2_cache_line_bytes + 3;
diff --git a/util/filter_bench.cc b/util/filter_bench.cc
index 04b419dd3..d466a469d 100644
--- a/util/filter_bench.cc
+++ b/util/filter_bench.cc
@@ -95,14 +95,20 @@ void _always_assert_fail(int line, const char *file, const char *expr) {
 #define ALWAYS_ASSERT(cond) \
   ((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond))
 
+#ifndef NDEBUG
+// This could affect build times enough that we should not include it for
+// accurate speed tests
+#define PREDICT_FP_RATE
+#endif
+
 using rocksdb::Arena;
 using rocksdb::BlockContents;
 using rocksdb::BloomFilterPolicy;
 using rocksdb::BloomHash;
+using rocksdb::BuiltinFilterBitsBuilder;
 using rocksdb::CachableEntry;
 using rocksdb::EncodeFixed32;
 using rocksdb::fastrange32;
-using rocksdb::FilterBitsBuilder;
 using rocksdb::FilterBitsReader;
 using rocksdb::FilterBuildingContext;
 using rocksdb::FullFilterBlockReader;
@@ -302,10 +308,13 @@ void FilterBench::Go() {
 
   std::cout << "Building..." << std::endl;
 
-  std::unique_ptr<FilterBitsBuilder> builder;
+  std::unique_ptr<BuiltinFilterBitsBuilder> builder;
 
   size_t total_memory_used = 0;
   size_t total_keys_added = 0;
+#ifdef PREDICT_FP_RATE
+  double weighted_predicted_fp_rate = 0.0;
+#endif
 
   rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
 
@@ -330,12 +339,17 @@ void FilterBench::Go() {
         info.filter_ = info.plain_table_bloom_->GetRawData();
       } else {
         if (!builder) {
-          builder.reset(GetBuilder());
+          builder.reset(&dynamic_cast<BuiltinFilterBitsBuilder&>(*GetBuilder()));
         }
         for (uint32_t i = 0; i < keys_to_add; ++i) {
           builder->AddKey(kms_[0].Get(filter_id, i));
         }
         info.filter_ = builder->Finish(&info.owner_);
+#ifdef PREDICT_FP_RATE
+        weighted_predicted_fp_rate +=
+            keys_to_add *
+            builder->EstimatedFpRate(keys_to_add, info.filter_.size());
+#endif
         if (FLAGS_new_builder) {
           builder.reset();
         }
@@ -362,6 +376,11 @@ void FilterBench::Go() {
 
   double bpk = total_memory_used * 8.0 / total_keys_added;
   std::cout << "Bits/key actual: " << bpk << std::endl;
+#ifdef PREDICT_FP_RATE
+  std::cout << "Predicted FP rate %: "
+            << 100.0 * (weighted_predicted_fp_rate / total_keys_added)
+            << std::endl;
+#endif
   if (!FLAGS_quick && !FLAGS_best_case) {
     double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0));
     std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk)
--
GitLab