// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2012 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include #include #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/filter_policy_internal.h" #include "third-party/folly/folly/ConstexprMath.h" #include "util/bloom_impl.h" #include "util/coding.h" #include "util/hash.h" namespace rocksdb { namespace { // See description in FastLocalBloomImpl class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { public: explicit FastLocalBloomBitsBuilder(const int millibits_per_key) : millibits_per_key_(millibits_per_key), num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { assert(millibits_per_key >= 1000); } // No Copy allowed FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; void operator=(const FastLocalBloomBitsBuilder&) = delete; ~FastLocalBloomBitsBuilder() override {} virtual void AddKey(const Slice& key) override { uint64_t hash = GetSliceHash64(key); if (hash_entries_.empty() || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } } virtual Slice Finish(std::unique_ptr* buf) override { uint32_t len_with_metadata = CalculateSpace(static_cast(hash_entries_.size())); char* data = new char[len_with_metadata]; memset(data, 0, len_with_metadata); assert(data); assert(len_with_metadata >= 5); uint32_t len = len_with_metadata - 5; if (len > 0) { AddAllEntries(data, len); } // See BloomFilterPolicy::GetBloomBitsReader re: metadata // -1 = Marker for newer Bloom implementations data[len] = static_cast(-1); // 0 = Marker for this sub-implementation data[len + 1] = static_cast(0); // num_probes (and 0 in upper bits for 64-byte block size) data[len + 2] = static_cast(num_probes_); // rest of metadata stays zero const char* const_data = data; buf->reset(const_data); assert(hash_entries_.empty()); return Slice(data, len_with_metadata); } int CalculateNumEntry(const uint32_t bytes) override { uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; return static_cast(uint64_t{8000} * bytes_no_meta / millibits_per_key_); } uint32_t CalculateSpace(const int num_entry) override { uint32_t num_cache_lines = 0; if (millibits_per_key_ > 0 && num_entry > 0) { num_cache_lines = static_cast( (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); } return num_cache_lines * 64 + /*metadata*/ 5; } double EstimatedFpRate(size_t keys, size_t bytes) override { return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, num_probes_, /*hash bits*/ 64); } private: void AddAllEntries(char* data, uint32_t len) { // Simple version without prefetching: // // for (auto h : hash_entries_) { // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, // num_probes_, data); // } const size_t num_entries = hash_entries_.size(); constexpr size_t kBufferMask = 7; static_assert(((kBufferMask + 1) & kBufferMask) == 0, "Must be power of 2 minus 1"); std::array hashes; std::array byte_offsets; // Prime the buffer size_t i = 0; for (; i <= kBufferMask && i < num_entries; ++i) { uint64_t h = hash_entries_.front(); hash_entries_.pop_front(); FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, /*out*/ &byte_offsets[i]); hashes[i] = Upper32of64(h); } // Process and buffer for (; i < num_entries; ++i) { uint32_t& hash_ref = hashes[i & kBufferMask]; uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; // Process (add) FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, data + byte_offset_ref); // And buffer uint64_t h = hash_entries_.front(); hash_entries_.pop_front(); FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, /*out*/ &byte_offset_ref); hash_ref = Upper32of64(h); } // Finish processing for (i = 0; i <= kBufferMask && i < num_entries; ++i) { FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, data + byte_offsets[i]); } } int millibits_per_key_; int num_probes_; // A deque avoids unnecessary copying of already-saved values // and has near-minimal peak memory use. std::deque hash_entries_; }; // See description in FastLocalBloomImpl class FastLocalBloomBitsReader : public FilterBitsReader { public: FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} // No Copy allowed FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; void operator=(const FastLocalBloomBitsReader&) = delete; ~FastLocalBloomBitsReader() override {} bool MayMatch(const Slice& key) override { uint64_t h = GetSliceHash64(key); uint32_t byte_offset; FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, /*out*/ &byte_offset); return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, data_ + byte_offset); } virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { uint64_t h = GetSliceHash64(*keys[i]); FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, /*out*/ &byte_offsets[i]); hashes[i] = Upper32of64(h); } for (int i = 0; i < num_keys; ++i) { may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( hashes[i], num_probes_, data_ + byte_offsets[i]); } } private: const char* data_; const int num_probes_; const uint32_t len_bytes_; }; using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { public: explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); // No Copy allowed LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; void operator=(const LegacyBloomBitsBuilder&) = delete; ~LegacyBloomBitsBuilder() override; void AddKey(const Slice& key) override; Slice Finish(std::unique_ptr* buf) override; int CalculateNumEntry(const uint32_t bytes) override; uint32_t CalculateSpace(const int num_entry) override { uint32_t dont_care1; uint32_t dont_care2; return CalculateSpace(num_entry, &dont_care1, &dont_care2); } double EstimatedFpRate(size_t keys, size_t bytes) override { return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, num_probes_); } private: int bits_per_key_; int num_probes_; std::vector hash_entries_; Logger* info_log_; // Get totalbits that optimized for cpu cache line uint32_t GetTotalBitsForLocality(uint32_t total_bits); // Reserve space for new filter char* ReserveSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines); // Implementation-specific variant of public CalculateSpace uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines); // Assuming single threaded access to this function. void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); }; LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log) : bits_per_key_(bits_per_key), num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), info_log_(info_log) { assert(bits_per_key_); } LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} void LegacyBloomBitsBuilder::AddKey(const Slice& key) { uint32_t hash = BloomHash(key); if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } } Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { uint32_t total_bits, num_lines; size_t num_entries = hash_entries_.size(); char* data = ReserveSpace(static_cast(num_entries), &total_bits, &num_lines); assert(data); if (total_bits != 0 && num_lines != 0) { for (auto h : hash_entries_) { AddHash(h, data, num_lines, total_bits); } // Check for excessive entries for 32-bit hash function if (num_entries >= /* minimum of 3 million */ 3000000U) { // More specifically, we can detect that the 32-bit hash function // is causing significant increase in FP rate by comparing current // estimated FP rate to what we would get with a normal number of // keys at same memory ratio. double est_fp_rate = LegacyBloomImpl::EstimatedFpRate( num_entries, total_bits / 8, num_probes_); double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate( 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_); if (est_fp_rate >= 1.50 * vs_fp_rate) { // For more details, see // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter ROCKS_LOG_WARN( info_log_, "Using legacy SST/BBT Bloom filter with excessive key count " "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. " "Consider using new Bloom with format_version>=5, smaller SST " "file size, or partitioned filters.", num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate); } } } // See BloomFilterPolicy::GetFilterBitsReader for metadata data[total_bits / 8] = static_cast(num_probes_); EncodeFixed32(data + total_bits / 8 + 1, static_cast(num_lines)); const char* const_data = data; buf->reset(const_data); hash_entries_.clear(); return Slice(data, total_bits / 8 + 5); } uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { uint32_t num_lines = (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); // Make num_lines an odd number to make sure more bits are involved // when determining which block. if (num_lines % 2 == 0) { num_lines++; } return num_lines * (CACHE_LINE_SIZE * 8); } uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines) { assert(bits_per_key_); if (num_entry != 0) { uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); *total_bits = GetTotalBitsForLocality(total_bits_tmp); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); assert(*total_bits > 0 && *total_bits % 8 == 0); } else { // filter is empty, just leave space for metadata *total_bits = 0; *num_lines = 0; } // Reserve space for Filter uint32_t sz = *total_bits / 8; sz += 5; // 4 bytes for num_lines, 1 byte for num_probes return sz; } char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, uint32_t* total_bits, uint32_t* num_lines) { uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); char* data = new char[sz]; memset(data, 0, sz); return data; } int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { assert(bits_per_key_); assert(bytes > 0); int high = static_cast(bytes * 8 / bits_per_key_ + 1); int low = 1; int n = high; for (; n >= low; n--) { if (CalculateSpace(n) <= bytes) { break; } } assert(n < high); // High should be an overestimation return n; } inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { #ifdef NDEBUG static_cast(total_bits); #endif assert(num_lines > 0 && total_bits > 0); LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, folly::constexpr_log2(CACHE_LINE_SIZE)); } class LegacyBloomBitsReader : public FilterBitsReader { public: LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, uint32_t log2_cache_line_size) : data_(data), num_probes_(num_probes), num_lines_(num_lines), log2_cache_line_size_(log2_cache_line_size) {} // No Copy allowed LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; void operator=(const LegacyBloomBitsReader&) = delete; ~LegacyBloomBitsReader() override {} // "contents" contains the data built by a preceding call to // FilterBitsBuilder::Finish. MayMatch must return true if the key was // passed to FilterBitsBuilder::AddKey. This method may return true or false // if the key was not on the list, but it should aim to return false with a // high probability. bool MayMatch(const Slice& key) override { uint32_t hash = BloomHash(key); uint32_t byte_offset; LegacyBloomImpl::PrepareHashMayMatch( hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); return LegacyBloomImpl::HashMayMatchPrepared( hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); } virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { hashes[i] = BloomHash(*keys[i]); LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, /*out*/ &byte_offsets[i], log2_cache_line_size_); } for (int i = 0; i < num_keys; ++i) { may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( hashes[i], num_probes_, data_ + byte_offsets[i], log2_cache_line_size_); } } private: const char* data_; const int num_probes_; const uint32_t num_lines_; const uint32_t log2_cache_line_size_; }; class AlwaysTrueFilter : public FilterBitsReader { public: bool MayMatch(const Slice&) override { return true; } using FilterBitsReader::MayMatch; // inherit overload }; class AlwaysFalseFilter : public FilterBitsReader { public: bool MayMatch(const Slice&) override { return false; } using FilterBitsReader::MayMatch; // inherit overload }; } // namespace const std::vector BloomFilterPolicy::kAllFixedImpls = { kLegacyBloom, kDeprecatedBlock, kFastLocalBloom, }; const std::vector BloomFilterPolicy::kAllUserModes = { kDeprecatedBlock, kAuto, }; BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) : mode_(mode), warned_(false) { // Sanitize bits_per_key if (bits_per_key < 1.0) { bits_per_key = 1.0; } else if (!(bits_per_key < 100.0)) { // including NaN bits_per_key = 100.0; } // Includes a nudge toward rounding up, to ensure on all platforms // that doubles specified with three decimal digits after the decimal // point are interpreted accurately. millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); // For better or worse, this is a rounding up of a nudged rounding up, // e.g. 7.4999999999999 will round up to 8, but that provides more // predictability against small arithmetic errors in floating point. whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; } BloomFilterPolicy::~BloomFilterPolicy() {} const char* BloomFilterPolicy::Name() const { return "rocksdb.BuiltinBloomFilter"; } void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, std::string* dst) const { // We should ideally only be using this deprecated interface for // appropriately constructed BloomFilterPolicy assert(mode_ == kDeprecatedBlock); // Compute bloom filter size (in both bits and bytes) uint32_t bits = static_cast(n * whole_bits_per_key_); // For small n, we can see a very high false positive rate. Fix it // by enforcing a minimum bloom filter length. if (bits < 64) bits = 64; uint32_t bytes = (bits + 7) / 8; bits = bytes * 8; int num_probes = LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); dst->push_back(static_cast(num_probes)); // Remember # of probes char* array = &(*dst)[init_size]; for (int i = 0; i < n; i++) { LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, array); } } bool BloomFilterPolicy::KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { const size_t len = bloom_filter.size(); if (len < 2 || len > 0xffffffffU) { return false; } const char* array = bloom_filter.data(); const uint32_t bits = static_cast(len - 1) * 8; // Use the encoded k so that we can read filters generated by // bloom filters created using different parameters. const int k = static_cast(array[len - 1]); if (k > 30) { // Reserved for potentially new encodings for short bloom filters. // Consider it a match. return true; } // NB: using stored k not num_probes for whole_bits_per_key_ return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, array); } FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { // This code path should no longer be used, for the built-in // BloomFilterPolicy. Internal to RocksDB and outside // BloomFilterPolicy, only get a FilterBitsBuilder with // BloomFilterPolicy::GetBuilderFromContext(), which will call // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have // been warned (HISTORY.md) that they can no longer call this on // the built-in BloomFilterPolicy (unlikely). assert(false); return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); } FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( const FilterBuildingContext& context) const { Mode cur = mode_; // Unusual code construction so that we can have just // one exhaustive switch without (risky) recursion for (int i = 0; i < 2; ++i) { switch (cur) { case kAuto: if (context.table_options.format_version < 5) { cur = kLegacyBloom; } else { cur = kFastLocalBloom; } break; case kDeprecatedBlock: return nullptr; case kFastLocalBloom: return new FastLocalBloomBitsBuilder(millibits_per_key_); case kLegacyBloom: if (whole_bits_per_key_ >= 14 && context.info_log && !warned_.load(std::memory_order_relaxed)) { warned_ = true; const char* adjective; if (whole_bits_per_key_ >= 20) { adjective = "Dramatic"; } else { adjective = "Significant"; } // For more details, see // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter ROCKS_LOG_WARN( context.info_log, "Using legacy Bloom filter with high (%d) bits/key. " "%s filter space and/or accuracy improvement is available " "with format_version>=5.", whole_bits_per_key_, adjective); } return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); } } assert(false); return nullptr; // something legal } FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( const FilterBuildingContext& context) { if (context.table_options.filter_policy) { return context.table_options.filter_policy->GetBuilderWithContext(context); } else { return nullptr; } } // Read metadata to determine what kind of FilterBitsReader is needed // and return a new one. FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); if (len_with_meta <= 5) { // filter is empty or broken. Treat like zero keys added. return new AlwaysFalseFilter(); } // Legacy Bloom filter data: // 0 +-----------------------------------+ // | Raw Bloom filter data | // | ... | // len +-----------------------------------+ // | byte for num_probes or | // | marker for new implementations | // len+1 +-----------------------------------+ // | four bytes for number of cache | // | lines | // len_with_meta +-----------------------------------+ int8_t raw_num_probes = static_cast(contents.data()[len_with_meta - 5]); // NB: *num_probes > 30 and < 128 probably have not been used, because of // BloomFilterPolicy::initialize, unless directly calling // LegacyBloomBitsBuilder as an API, but we are leaving those cases in // limbo with LegacyBloomBitsReader for now. if (raw_num_probes < 1) { // Note: < 0 (or unsigned > 127) indicate special new implementations // (or reserved for future use) if (raw_num_probes == -1) { // Marker for newer Bloom implementations return GetBloomBitsReader(contents); } // otherwise // Treat as zero probes (always FP) for now. return new AlwaysTrueFilter(); } // else attempt decode for LegacyBloomBitsReader int num_probes = raw_num_probes; assert(num_probes >= 1); assert(num_probes <= 127); uint32_t len = len_with_meta - 5; assert(len > 0); uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); uint32_t log2_cache_line_size; if (num_lines * CACHE_LINE_SIZE == len) { // Common case log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); } else if (num_lines == 0 || len % num_lines != 0) { // Invalid (no solution to num_lines * x == len) // Treat as zero probes (always FP) for now. return new AlwaysTrueFilter(); } else { // Determine the non-native cache line size (from another system) log2_cache_line_size = 0; while ((num_lines << log2_cache_line_size) < len) { ++log2_cache_line_size; } if ((num_lines << log2_cache_line_size) != len) { // Invalid (block size not a power of two) // Treat as zero probes (always FP) for now. return new AlwaysTrueFilter(); } } // if not early return return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, log2_cache_line_size); } // For newer Bloom filter implementations FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); uint32_t len = len_with_meta - 5; assert(len > 0); // precondition // New Bloom filter data: // 0 +-----------------------------------+ // | Raw Bloom filter data | // | ... | // len +-----------------------------------+ // | char{-1} byte -> new Bloom filter | // len+1 +-----------------------------------+ // | byte for subimplementation | // | 0: FastLocalBloom | // | other: reserved | // len+2 +-----------------------------------+ // | byte for block_and_probes | // | 0 in top 3 bits -> 6 -> 64-byte | // | reserved: | // | 1 in top 3 bits -> 7 -> 128-byte| // | 2 in top 3 bits -> 8 -> 256-byte| // | ... | // | num_probes in bottom 5 bits, | // | except 0 and 31 reserved | // len+3 +-----------------------------------+ // | two bytes reserved | // | possibly for hash seed | // len_with_meta +-----------------------------------+ // Read more metadata (see above) char sub_impl_val = contents.data()[len_with_meta - 4]; char block_and_probes = contents.data()[len_with_meta - 3]; int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; int num_probes = (block_and_probes & 31); if (num_probes < 1 || num_probes > 30) { // Reserved / future safe return new AlwaysTrueFilter(); } uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); if (rest != 0) { // Reserved, possibly for hash seed // Future safe return new AlwaysTrueFilter(); } if (sub_impl_val == 0) { // FastLocalBloom if (log2_block_bytes == 6) { // Only block size supported for now return new FastLocalBloomBitsReader(contents.data(), num_probes, len); } } // otherwise // Reserved / future safe return new AlwaysTrueFilter(); } const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, bool use_block_based_builder) { BloomFilterPolicy::Mode m; if (use_block_based_builder) { m = BloomFilterPolicy::kDeprecatedBlock; } else { m = BloomFilterPolicy::kAuto; } assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), BloomFilterPolicy::kAllUserModes.end(), m) != BloomFilterPolicy::kAllUserModes.end()); return new BloomFilterPolicy(bits_per_key, m); } FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} FilterPolicy::~FilterPolicy() { } } // namespace rocksdb