From caf0f53a74ee2bf624c423d4764f4d7774a26f8b Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Thu, 9 Aug 2018 16:49:45 -0700 Subject: [PATCH] Index value delta encoding (#3983) Summary: Given that index value is a BlockHandle, which is basically an pair we can apply delta encoding on the values. The first value at each index restart interval encoded the full BlockHandle but the rest encode only the size. Refer to IndexBlockIter::DecodeCurrentValue for the detail of the encoding. This reduces the index size which helps using the block cache more efficiently. The feature is enabled with using format_version 4. The feature comes with a bit of cpu overhead which should be paid back by the higher cache hits due to smaller index block size. Results with sysbench read-only using 4k blocks and using 16 index restart interval: Format 2: 19585 rocksdb read-only range=100 Format 3: 19569 rocksdb read-only range=100 Format 4: 19352 rocksdb read-only range=100 Pull Request resolved: https://github.com/facebook/rocksdb/pull/3983 Differential Revision: D8361343 Pulled By: maysamyabandeh fbshipit-source-id: f882ee082322acac32b0072e2bdbb0b5f854e651 --- HISTORY.md | 3 +- db/builder.h | 1 - db/db_impl.cc | 2 +- db/db_iter.h | 1 - db/db_properties_test.cc | 39 +-- db/db_test_util.cc | 4 +- db/db_test_util.h | 2 +- db/memtable.h | 1 - db/merge_helper.h | 1 - db/table_cache.cc | 4 +- db/table_cache.h | 1 - db/version_edit.cc | 4 +- db/version_set.h | 1 - include/rocksdb/table_properties.h | 3 + table/block.cc | 185 +++++++++--- table/block.h | 90 ++++-- table/block_based_table_builder.cc | 17 +- table/block_based_table_reader.cc | 341 +++++++++++----------- table/block_based_table_reader.h | 47 +-- table/block_builder.cc | 48 ++- table/block_builder.h | 9 +- table/block_test.cc | 101 +++++++ table/cuckoo_table_reader.cc | 5 +- table/cuckoo_table_reader.h | 1 - table/format.cc | 19 ++ table/format.h | 4 +- table/index_builder.cc | 48 ++- table/index_builder.h | 40 ++- table/internal_iterator.h | 28 +- table/iterator.cc | 49 +++- table/iterator_wrapper.h | 34 ++- table/merging_iterator.cc | 2 +- table/merging_iterator.h | 4 +- table/meta_blocks.cc | 4 + table/meta_blocks.h | 1 - table/partitioned_filter_block.cc | 85 +++--- table/partitioned_filter_block.h | 12 +- table/partitioned_filter_block_test.cc | 12 +- table/plain_table_reader.h | 1 - table/table_properties.cc | 8 +- table/table_properties_internal.h | 1 - table/table_reader.h | 1 - table/table_test.cc | 14 +- table/two_level_iterator.cc | 63 ++-- table/two_level_iterator.h | 8 +- util/coding.h | 24 +- util/testutil.cc | 2 +- utilities/transactions/transaction_test.h | 2 +- 48 files changed, 890 insertions(+), 487 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 653018937..6b89342f8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Public API Change ### New Features +* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. ### Bug Fixes ## 5.15.0 (7/17/2018) @@ -13,7 +14,7 @@ * The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features -* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. * Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1. * Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`. * Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. diff --git a/db/builder.h b/db/builder.h index 0e8218e74..9995723df 100644 --- a/db/builder.h +++ b/db/builder.h @@ -35,7 +35,6 @@ class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; -class InternalIterator; // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. It must outlive the diff --git a/db/db_impl.cc b/db/db_impl.cc index a9fde0b23..1aeb5ab68 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1047,7 +1047,7 @@ InternalIterator* DBImpl::NewInternalIterator( } else { CleanupSuperVersion(super_version); } - return NewErrorInternalIterator(s, arena); + return NewErrorInternalIterator(s, arena); } ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { diff --git a/db/db_iter.h b/db/db_iter.h index 989a0ba1f..56b6a42db 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -23,7 +23,6 @@ namespace rocksdb { class Arena; class DBIter; -class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 819758e3f..bdd50ec6c 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -180,17 +180,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { ResetTableProperties(tp); sscanf(tp_string.c_str(), "# data blocks %" SCNu64 " # entries %" SCNu64 - " # range deletions %" SCNu64 - " raw key size %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ") %" SCNu64 " filter block size %" SCNu64, + ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, - &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, - &tp->filter_size); + &tp->data_size, &tp->index_key_is_user_key, + &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -224,14 +223,11 @@ void VerifyTableProperties(const TableProperties& base_tp, ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, - const int kRangeDeletionsPerTable, - const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize, - const bool index_key_is_user_key) { +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { const int kKeyCount = kTableCount * kKeysPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; @@ -248,7 +244,9 @@ void GetExpectedTableProperties(TableProperties* expected_tp, kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = expected_tp->num_data_blocks * - (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8)); + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 1 : 0)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -342,12 +340,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { TableProperties output_tp; ParseTablePropertiesString(property, &output_tp); bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0; TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, kKeysPerTable, kRangeDeletionsPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); VerifyTableProperties(expected_tp, output_tp); } @@ -533,6 +533,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -542,10 +543,10 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.num_entries, tp.num_entries); ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { - GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, - kRangeDeletionsPerTable, table, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, + index_key_is_user_key, value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 5867713ae..90560d90e 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -449,8 +449,8 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewNoopTransform()); break; } - case kBlockBasedTableWithPartitionedIndexFormat3: { - table_options.format_version = 3; + case kBlockBasedTableWithPartitionedIndexFormat4: { + table_options.format_version = 4; // Format 3 changes the binary index format. Since partitioned index is a // super-set of simple indexes, we are also using kTwoLevelIndexSearch to // test this format. diff --git a/db/db_test_util.h b/db/db_test_util.h index 56ab733bf..30ee2e0c2 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -698,7 +698,7 @@ class DBTestBase : public testing::Test { kLevelSubcompactions, kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, - kBlockBasedTableWithPartitionedIndexFormat3, + kBlockBasedTableWithPartitionedIndexFormat4, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, // This must be the last line diff --git a/db/memtable.h b/db/memtable.h index 6082a8e6c..f1dbb7012 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,7 +34,6 @@ namespace rocksdb { class Mutex; class MemTableIterator; class MergeContext; -class InternalIterator; struct ImmutableMemTableOptions { explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, diff --git a/db/merge_helper.h b/db/merge_helper.h index db941808e..abb1e1756 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -26,7 +26,6 @@ class Iterator; class Logger; class MergeOperator; class Statistics; -class InternalIterator; class MergeHelper { public: diff --git a/db/table_cache.cc b/db/table_cache.cc index edfffb0e8..f374a6876 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -238,7 +238,7 @@ InternalIterator* TableCache::NewIterator( if (s.ok()) { if (options.table_filter && !options.table_filter(*table_reader->GetTableProperties())) { - result = NewEmptyInternalIterator(arena); + result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, skip_filters, for_compaction); @@ -279,7 +279,7 @@ InternalIterator* TableCache::NewIterator( } if (!s.ok()) { assert(result == nullptr); - result = NewErrorInternalIterator(s, arena); + result = NewErrorInternalIterator(s, arena); } return result; } diff --git a/db/table_cache.h b/db/table_cache.h index dcf09edff..7e7f53cc1 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -31,7 +31,6 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; -class InternalIterator; class TableCache { public: diff --git a/db/version_edit.cc b/db/version_edit.cc index 40d25999e..447fbf378 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -46,7 +46,7 @@ enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatbile part is the CutsomtTag of kNewFile, we currently encode + // forward-compatible part is the CutsomtTag of kNewFile, we currently encode // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. kMinLogNumberToKeepHack = 3, @@ -274,7 +274,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { break; case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a - // forward-compatbile fashion. + // forward-compatible fashion. if (!GetFixed64(&field, &min_log_number_to_keep_)) { return "deleted log number malformatted"; } diff --git a/db/version_set.h b/db/version_set.h index fe8f26339..f57f84fb7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -52,7 +52,6 @@ class Writer; } class Compaction; -class InternalIterator; class LogBuffer; class LookupKey; class MemTable; diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 32ddb6c98..d545e455f 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -34,6 +34,7 @@ struct TablePropertiesNames { static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; @@ -139,6 +140,8 @@ struct TableProperties { // Whether the index key is user key. Otherwise it includes 8 byte of sequence // number added by internal key format. uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; // the size of filter block. uint64_t filter_size = 0; // total raw key size diff --git a/table/block.cc b/table/block.cc index c1891801c..0e6c2b112 100644 --- a/table/block.cc +++ b/table/block.cc @@ -33,28 +33,65 @@ namespace rocksdb { // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return nullptr; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; - } +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + assert(limit - p >= 3); + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return nullptr; + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; } - return p; -} +}; void DataBlockIter::Next() { assert(Valid()); @@ -170,7 +207,8 @@ void DataBlockIter::Seek(const Slice& target) { return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -198,8 +236,12 @@ void IndexBlockIter::Seek(const Slice& target) { bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); + } else if (value_delta_encoded_) { + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + active_comparator_); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + active_comparator_); } if (!ok) { @@ -222,7 +264,8 @@ void DataBlockIter::SeekForPrev(const Slice& target) { return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -277,7 +320,8 @@ void IndexBlockIter::SeekToLast() { } } -void BlockIter::CorruptionError() { +template +void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); @@ -298,7 +342,7 @@ bool DataBlockIter::ParseNextDataKey() { // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -340,10 +384,14 @@ bool DataBlockIter::ParseNextDataKey() { } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed return true; } } @@ -361,7 +409,12 @@ bool IndexBlockIter::ParseNextIndexKey() { // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -377,27 +430,69 @@ bool IndexBlockIter::ParseNextIndexKey() { key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_) { + assert(value_length == 0); + DecodeCurrentValue(shared); } return true; } +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + assert(value_delta_encoded_); + const char* limit = data_ + restarts_; + if (shared == 0) { + uint64_t o, s; + const char* newp = GetVarint64Ptr(value_.data(), limit, &o); + newp = GetVarint64Ptr(newp, limit, &s); + decoded_value_ = BlockHandle(o, s); + value_ = Slice(value_.data(), newp - value_.data()); + } else { + uint64_t next_value_base = + decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; + int64_t delta; + const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); + decoded_value_ = + BlockHandle(next_value_base, decoded_value_.size() + delta); + value_ = Slice(value_.data(), newp - value_.data()); + } +} + // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp) { +template +template +bool BlockIter::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; @@ -425,9 +520,13 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller @@ -544,6 +643,7 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, DataBlockIter* iter, Statistics* stats, bool /*total_order_seek*/, bool /*key_includes_seq*/, + bool /*value_is_full*/, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { @@ -577,7 +677,7 @@ template <> IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, + bool key_includes_seq, bool value_is_full, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { @@ -597,7 +697,8 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, cachable()); + prefix_index_ptr, key_includes_seq, value_is_full, + cachable()); } return ret_iter; diff --git a/table/block.h b/table/block.h index 8ee450ca9..a29ea1689 100644 --- a/table/block.h +++ b/table/block.h @@ -35,6 +35,7 @@ namespace rocksdb { struct BlockContents; class Comparator; +template class BlockIter; class DataBlockIter; class IndexBlockIter; @@ -164,6 +165,11 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. + // // NewIterator // Same as above but also updates read_amp_bitmap_ if it is not nullptr. // @@ -175,13 +181,11 @@ class Block { // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. template - TBlockIter* NewIterator(const Comparator* comparator, - const Comparator* user_comparator, - TBlockIter* iter = nullptr, - Statistics* stats = nullptr, - bool total_order_seek = true, - bool key_includes_seq = true, - BlockPrefixIndex* prefix_index = nullptr); + TBlockIter* NewIterator( + const Comparator* comparator, const Comparator* user_comparator, + TBlockIter* iter = nullptr, Statistics* stats = nullptr, + bool total_order_seek = true, bool key_includes_seq = true, + bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -204,7 +208,8 @@ class Block { void operator=(const Block&) = delete; }; -class BlockIter : public InternalIterator { +template +class BlockIter : public InternalIteratorBase { public: void InitializeBase(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts, @@ -243,10 +248,6 @@ class BlockIter : public InternalIterator { assert(Valid()); return key_.GetKey(); } - virtual Slice value() const override { - assert(Valid()); - return value_; - } #ifndef NDEBUG virtual ~BlockIter() { @@ -280,7 +281,8 @@ class BlockIter : public InternalIterator { const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array - uint32_t restart_index_; // Index of restart block in which current_ falls + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; @@ -316,11 +318,12 @@ class BlockIter : public InternalIterator { void CorruptionError(); - bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + template + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); }; -class DataBlockIter final : public BlockIter { +class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} @@ -405,14 +408,14 @@ class DataBlockIter final : public BlockIter { std::vector prev_entries_; int32_t prev_entries_idx_ = -1; - bool ParseNextDataKey(); + inline bool ParseNextDataKey(); inline int Compare(const IterKey& ikey, const Slice& b) const { return comparator_->Compare(ikey.GetInternalKey(), b); } }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -420,27 +423,46 @@ class IndexBlockIter final : public BlockIter { assert(Valid()); return key_.GetKey(); } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. IndexBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) + bool value_is_full, bool block_contents_pinned) : IndexBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned); + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full); } void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) { + bool value_is_full, bool block_contents_pinned) { InitializeBase(comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); key_includes_seq_ = key_includes_seq; active_comparator_ = key_includes_seq_ ? comparator_ : user_comparator; key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + } + + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } } virtual void Seek(const Slice& target) override; @@ -467,11 +489,25 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + // key_includes_seq_ ? comparator_ : user_comparator_ + const Comparator* active_comparator_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. + BlockHandle decoded_value_; + bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index); - int CompareBlockKey(uint32_t block_index, const Slice& target); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int Compare(const Slice& a, const Slice& b) const { return active_comparator_->Compare(a, b); @@ -481,13 +517,11 @@ class IndexBlockIter final : public BlockIter { return active_comparator_->Compare(ikey.GetKey(), b); } - bool ParseNextIndexKey(); + inline bool ParseNextIndexKey(); - // Key is in InternalKey format - bool key_includes_seq_; - // key_includes_seq_ ? comparator_ : user_comparator_ - const Comparator* active_comparator_; - BlockPrefixIndex* prefix_index_; + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); }; } // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 768fb7566..b0234e7dc 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -63,6 +63,7 @@ namespace { FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { if (table_opt.filter_policy == nullptr) return nullptr; @@ -85,7 +86,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -266,6 +267,7 @@ struct BlockBasedTableBuilder::Rep { TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. + const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -306,6 +308,8 @@ struct BlockBasedTableBuilder::Rep { internal_prefix_transform(_moptions.prefix_extractor.get()), compression_dict(_compression_dict), compression_ctx(_compression_type, _compression_opts), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( @@ -317,18 +321,21 @@ struct BlockBasedTableBuilder::Rep { if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( - &internal_comparator, table_options); + &internal_comparator, use_delta_encoding_for_index_values, + table_options); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, - &this->internal_prefix_transform, table_options)); + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); } if (skip_filters) { filter_builder = nullptr; } else { filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, p_index_builder_)); + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -793,6 +800,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock( } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index cd292a3fe..83b722cc9 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -214,7 +214,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq) { + const int level, const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -225,36 +226,37 @@ class PartitionIndexReader : public IndexReader, public Cleanable { if (s.ok()) { *index_reader = new PartitionIndexReader( table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq); + level, index_key_includes_seq, index_value_is_full); } return s; } // return a two-level iterator: first level is on the partition index - virtual InternalIterator* NewIterator(IndexBlockIter* /*iter*/ = nullptr, - bool /*dont_care*/ = true, - bool fill_cache = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, + bool fill_cache = true) override { Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_), + table_, &partition_map_, index_key_includes_seq_, + index_value_is_full_), index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_)); + kNullStats, true, index_key_includes_seq_, index_value_is_full_)); } else { auto ro = ReadOptions(); ro.fill_cache = fill_cache; bool kIsIndex = true; - return new BlockBasedTableIterator( + return new BlockBasedTableIterator( table_, ro, *icomparator_, index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_), + kNullStats, true, index_key_includes_seq_, index_value_is_full_), false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -270,7 +272,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { Statistics* kNullStats = nullptr; index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -278,14 +280,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // Empty index. return; } - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -294,36 +289,21 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // Empty index. return; } - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); + Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read index partition"); - continue; - } - + handle = biter.value(); BlockBasedTable::CachableEntry block; Slice compression_dict; if (rep->compression_dict_block) { @@ -374,11 +354,13 @@ class PartitionIndexReader : public IndexReader, public Cleanable { PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq) + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), table_(table), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } BlockBasedTable* table_; @@ -386,6 +368,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { std::unordered_map> partition_map_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. @@ -404,7 +387,8 @@ class BinarySearchIndexReader : public IndexReader { const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -415,19 +399,19 @@ class BinarySearchIndexReader : public IndexReader { if (s.ok()) { *index_reader = new BinarySearchIndexReader( icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq); + index_key_includes_seq, index_value_is_full); } return s; } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } virtual size_t size() const override { return index_block_->size(); } @@ -449,31 +433,32 @@ class BinarySearchIndexReader : public IndexReader { private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq) + Statistics* stats, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that leverages an internal hash table to quicken the lookup for a given // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - const BlockHandle& index_handle, - InternalIterator* meta_index_iter, - IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -489,9 +474,9 @@ class HashIndexReader : public IndexReader { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. - auto new_index_reader = - new HashIndexReader(icomparator, std::move(index_block), - ioptions.statistics, index_key_includes_seq); + auto new_index_reader = new HashIndexReader( + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); *index_reader = new_index_reader; // Get prefixes block @@ -545,13 +530,14 @@ class HashIndexReader : public IndexReader { return Status::OK(); } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, prefix_index_.get()); + total_order_seek, index_key_includes_seq_, index_value_is_full_, + prefix_index_.get()); } virtual size_t size() const override { return index_block_->size(); } @@ -577,10 +563,12 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq) + const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } @@ -591,6 +579,7 @@ class HashIndexReader : public IndexReader { std::unique_ptr prefix_index_; BlockContents prefixes_contents_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. @@ -1030,8 +1019,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, bool disable_prefix_seek = rep->index_type == BlockBasedTableOptions::kHashSearch && need_upper_bound_check; - unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); + unique_ptr> iter( + new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, + nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -1435,7 +1425,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, rep_->table_properties == nullptr || - !rep_->table_properties->index_key_is_user_key); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case Rep::FilterType::kBlockFilter: @@ -1553,7 +1545,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIterator* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, CachableEntry* index_entry, GetContext* get_context) { @@ -1592,7 +1584,8 @@ InternalIterator* BlockBasedTable::NewIndexIterator( input_iter->Invalidate(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorInternalIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator( + Status::Incomplete("no blocking io")); } } @@ -1639,7 +1632,7 @@ InternalIterator* BlockBasedTable::NewIndexIterator( input_iter->Invalidate(s); return input_iter; } else { - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } } @@ -1660,21 +1653,6 @@ InternalIterator* BlockBasedTable::NewIndexIterator( return iter; } -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, FilePrefetchBuffer* prefetch_buffer) { - BlockHandle handle; - Slice input = index_value; - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. - Status s = handle.DecodeFrom(&input); - return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, - key_includes_seq, get_context, s, - prefetch_buffer); -} - // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. // If input_iter is null, new a iterator @@ -1683,7 +1661,8 @@ template TBlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -1733,7 +1712,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool kTotalOrderSeek = true; iter = block.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq); + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1848,22 +1828,20 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - bool index_key_includes_seq) + bool index_key_includes_seq, bool index_key_is_full) : table_(table), block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq) {} + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} -template -const size_t BlockBasedTableIterator::kMaxReadaheadSize = +template +const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; -InternalIterator* +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const Slice& index_value) { + const BlockHandle& handle) { // Return a block iterator on the index partition - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space @@ -1879,10 +1857,10 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( Statistics* kNullStats = nullptr; return block->second.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_); + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } // Create an empty iterator - return new DataBlockIter(); + return new IndexBlockIter(); } // This will be broken if the user specifies an unusual implementation @@ -1955,7 +1933,7 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - unique_ptr iiter( + unique_ptr> iiter( NewIndexIterator(no_io_read_options, /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); @@ -1988,10 +1966,7 @@ bool BlockBasedTable::PrefixMayMatch( // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. - Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); + BlockHandle handle = iiter->value(); may_match = filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); } @@ -2015,8 +1990,8 @@ bool BlockBasedTable::PrefixMayMatch( return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { +template +void BlockBasedTableIterator::Seek(const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2045,8 +2020,9 @@ void BlockBasedTableIterator::Seek(const Slice& target) { block_iter_.key()) <= 0)); } -template -void BlockBasedTableIterator::SeekForPrev(const Slice& target) { +template +void BlockBasedTableIterator::SeekForPrev( + const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2088,8 +2064,8 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) { icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { +template +void BlockBasedTableIterator::SeekToFirst() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToFirst(); @@ -2102,8 +2078,8 @@ void BlockBasedTableIterator::SeekToFirst() { FindKeyForward(); } -template -void BlockBasedTableIterator::SeekToLast() { +template +void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); @@ -2116,32 +2092,30 @@ void BlockBasedTableIterator::SeekToLast() { FindKeyBackward(); } -template -void BlockBasedTableIterator::Next() { +template +void BlockBasedTableIterator::Next() { assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); } -template -void BlockBasedTableIterator::Prev() { +template +void BlockBasedTableIterator::Prev() { assert(block_iter_points_to_real_block_); block_iter_.Prev(); FindKeyBackward(); } -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle; - Slice handle_slice = index_iter_->value(); +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); if (!block_iter_points_to_real_block_ || - handle_slice.compare(prev_index_value_) != 0 || + data_block_handle.offset() != prev_index_value_.offset() || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { ResetDataIter(); } - Status s = data_block_handle.DecodeFrom(&handle_slice); auto* rep = table_->get_rep(); // Automatically prefetch additional data when a range scan (iterator) does @@ -2173,16 +2147,17 @@ void BlockBasedTableIterator::InitDataBlock() { } } + Status s; BlockBasedTable::NewDataBlockIterator( rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, + key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; } } -template -void BlockBasedTableIterator::FindKeyForward() { +template +void BlockBasedTableIterator::FindKeyForward() { assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". @@ -2221,8 +2196,8 @@ void BlockBasedTableIterator::FindKeyForward() { } } -template -void BlockBasedTableIterator::FindKeyBackward() { +template +void BlockBasedTableIterator::FindKeyBackward() { assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { @@ -2297,11 +2272,10 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( return iter; } } - std::string str; - rep_->range_del_handle.EncodeTo(&str); // The meta-block exists but isn't in uncompressed block cache (maybe // because it is disabled), so go through the full lookup process. - return NewDataBlockIterator(rep_, read_options, Slice(str)); + return NewDataBlockIterator(rep_, read_options, + rep_->range_del_handle); } bool BlockBasedTable::FullFilterKeyMayMatch( @@ -2364,7 +2338,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, /* index_entry */ nullptr, get_context); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2372,12 +2346,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - Slice handle_value = iiter->value(); + BlockHandle handle = iiter->value(); - BlockHandle handle; bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, handle.offset(), no_io); @@ -2455,9 +2427,10 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -2470,7 +2443,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - Slice block_handle = iiter->value(); + BlockHandle block_handle = iiter->value(); const bool is_user_key = rep_->table_properties && rep_->table_properties->index_key_is_user_key > 0; if (end && @@ -2516,11 +2489,12 @@ Status BlockBasedTable::VerifyChecksum() { } // Check Data blocks IndexBlockIter iiter_on_stack; - InternalIterator* iiter = + InternalIteratorBase* iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -2530,19 +2504,41 @@ Status BlockBasedTable::VerifyChecksum() { return s; } -Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); + BlockHandle handle = index_iter->value(); + BlockContents contents; + Slice dummy_comp_dict; + BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, ReadOptions(), handle, &contents, + rep_->ioptions, false /* decompress */, + dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); if (!s.ok()) { break; } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); BlockContents contents; Slice dummy_comp_dict; BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, @@ -2560,15 +2556,13 @@ Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr> iiter( + NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; - BlockHandle handle; - Slice input = iiter->value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); + BlockHandle handle = iiter->value(); Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); @@ -2578,6 +2572,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, cache_key_storage); Slice ckey; + Status s; s = GetDataBlockFromCache( cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, rep_->table_options.format_version, @@ -2638,14 +2633,18 @@ Status BlockBasedTable::CreateIndexReader( rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, level, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2665,7 +2664,9 @@ Status BlockBasedTable::CreateIndexReader( rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } meta_index_iter = meta_iter_guard.get(); } @@ -2676,7 +2677,9 @@ Status BlockBasedTable::CreateIndexReader( index_reader, rep_->hash_index_allow_collision, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } default: { std::string error_message = @@ -2687,22 +2690,14 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + unique_ptr> index_iter( + NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->footer.metaindex_handle().offset(); - } + BlockHandle handle = index_iter->value(); + result = handle.offset(); } else { // key is past the last key in the file. If table_properties is not // available, approximate the offset by returning the offset of the @@ -2729,7 +2724,7 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); @@ -2944,7 +2939,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2993,7 +2988,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -3013,9 +3008,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - Slice bh_val = blockhandles_iter->value(); - BlockHandle bh; - bh.DecodeFrom(&bh_val); + BlockHandle bh = blockhandles_iter->value(); uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 7e7b41a71..3cada0c2c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -51,7 +51,6 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; -class InternalIterator; using std::unique_ptr; @@ -178,9 +177,9 @@ class BlockBasedTable : public TableReader { // to // a different object then iter and the callee has the ownership of the // returned object. - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool fill_cache = true) = 0; + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; // The size of the index. virtual size_t size() const = 0; @@ -224,14 +223,16 @@ class BlockBasedTable : public TableReader { static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const Slice& index_value, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); template static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, - Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); class PartitionedIndexIteratorState; @@ -284,7 +285,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, @@ -353,7 +354,8 @@ class BlockBasedTable : public TableReader { std::unique_ptr* meta_block, std::unique_ptr* iter); - Status VerifyChecksumInBlocks(InternalIterator* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. virtual FilterBlockReader* ReadFilter( @@ -390,14 +392,16 @@ class BlockBasedTable::PartitionedIndexIteratorState PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - const bool index_key_includes_seq); - InternalIterator* NewSecondaryIterator(const Slice& index_value) override; + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& index_value) override; private: // Don't own table_ BlockBasedTable* table_; std::unordered_map>* block_map_; bool index_key_includes_seq_; + bool index_key_is_full_; }; // CachableEntry represents the entries that *may* be fetched from block cache. @@ -521,16 +525,17 @@ struct BlockBasedTable::Rep { const bool immortal_table; }; -template -class BlockBasedTableIterator : public InternalIterator { +template +class BlockBasedTableIterator : public InternalIteratorBase { public: BlockBasedTableIterator(BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIterator* index_iter, bool check_filter, - bool need_upper_bound_check, + InternalIteratorBase* index_iter, + bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, + bool index_key_is_full = true, bool for_compaction = false) : table_(table), read_options_(read_options), @@ -543,6 +548,7 @@ class BlockBasedTableIterator : public InternalIterator { prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -561,7 +567,7 @@ class BlockBasedTableIterator : public InternalIterator { assert(Valid()); return block_iter_.key(); } - Slice value() const override { + TValue value() const override { assert(Valid()); return block_iter_.value(); } @@ -618,8 +624,7 @@ class BlockBasedTableIterator : public InternalIterator { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - Slice v = index_iter_->value(); - prev_index_value_.assign(v.data(), v.size()); + prev_index_value_ = index_iter_->value(); } } @@ -631,7 +636,7 @@ class BlockBasedTableIterator : public InternalIterator { BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; - InternalIterator* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; bool block_iter_points_to_real_block_; @@ -644,10 +649,10 @@ class BlockBasedTableIterator : public InternalIterator { bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; + bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; - // TODO use block offset instead - std::string prev_index_value_; + BlockHandle prev_index_value_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff --git a/table/block_builder.cc b/table/block_builder.cc index 39bfffe51..ba4ef09ec 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -41,9 +41,11 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) +BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), restarts_(), counter_(0), finished_(false) { @@ -65,14 +67,27 @@ void BlockBuilder::Reset() { size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) const { size_t estimate = CurrentSizeEstimate(); - estimate += key.size() + value.size(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. + estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + if (counter_ >= block_restart_interval_) { estimate += sizeof(uint32_t); // a new restart entry. } estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. estimate += VarintLength(key.size()); // varint for key length. - estimate += VarintLength(value.size()); // varint for value length. + if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } return estimate; } @@ -87,9 +102,11 @@ Slice BlockBuilder::Finish() { return Slice(buffer_); } -void BlockBuilder::Add(const Slice& key, const Slice& value) { +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { assert(!finished_); assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -115,14 +132,27 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { const size_t non_shared = key.size() - shared; const size_t curr_size = buffer_.size(); - // Add "" to buffer_ - PutVarint32Varint32Varint32(&buffer_, static_cast(shared), - static_cast(non_shared), - static_cast(value.size())); + if (use_value_delta_encoding_) { + // Add "" to buffer_ + PutVarint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared)); + } else { + // Add "" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); + } // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } counter_++; estimate_ += buffer_.size() - curr_size; diff --git a/table/block_builder.h b/table/block_builder.h index 6b5297d04..f2be4c020 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -21,14 +21,16 @@ class BlockBuilder { void operator=(const BlockBuilder&) = delete; explicit BlockBuilder(int block_restart_interval, - bool use_delta_encoding = true); + bool use_delta_encoding = true, + bool use_value_delta_encoding = false); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); // REQUIRES: Finish() has not been called since the last call to Reset(). // REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the @@ -49,7 +51,10 @@ class BlockBuilder { private: const int block_restart_interval_; + //TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points diff --git a/table/block_test.cc b/table/block_test.cc index 0ed450f07..009740a28 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -68,6 +68,29 @@ void GenerateRandomKVs(std::vector *keys, } } +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + class BlockTest : public testing::Test {}; // block test @@ -131,6 +154,84 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase *iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index d008ff1a3..fb14b1759 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -374,15 +374,12 @@ Slice CuckooTableIterator::value() const { return curr_value_; } -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); - InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { if (!status().ok()) { - return NewErrorInternalIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } CuckooTableIterator* iter; diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index f1539f09f..b37d46373 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -25,7 +25,6 @@ namespace rocksdb { class Arena; class TableReader; -class InternalIterator; class CuckooTableReader: public TableReader { public: diff --git a/table/format.cc b/table/format.cc index 1f8d7c3a6..a4e448870 100644 --- a/table/format.cc +++ b/table/format.cc @@ -54,6 +54,13 @@ void BlockHandle::EncodeTo(std::string* dst) const { PutVarint64Varint64(dst, offset_, size_); } +void BlockHandle::EncodeSizeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, size_); +} + Status BlockHandle::DecodeFrom(Slice* input) { if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { @@ -66,6 +73,18 @@ Status BlockHandle::DecodeFrom(Slice* input) { } } +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + // Return a string that contains the copy of handle. std::string BlockHandle::ToString(bool hex) const { std::string handle_str; diff --git a/table/format.h b/table/format.h index fd8459fdc..ebc9c2539 100644 --- a/table/format.h +++ b/table/format.h @@ -54,6 +54,8 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); + void EncodeSizeTo(std::string* dst) const; // Return a string that contains the copy of handle. std::string ToString(bool hex = true) const; @@ -90,7 +92,7 @@ inline uint32_t GetCompressFormatForVersion( } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 3; + return version <= 4; } // Footer encapsulates the fixed information stored at the tail diff --git a/table/index_builder.cc b/table/index_builder.cc index ebabbeb8d..6b8114f3e 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -27,23 +27,26 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - result = new ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval, - table_opt.format_version); + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder(comparator, int_key_slice_transform, table_opt.index_block_restart_interval, - table_opt.format_version); + table_opt.format_version, + use_value_delta_encoding); } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { - result = PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt); + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); } break; default: { @@ -56,21 +59,27 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { - return new PartitionedIndexBuilder(comparator, table_opt); + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt) + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) : IndexBuilder(comparator), index_block_builder_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), index_block_builder_without_seq_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), sub_index_builder_(nullptr), table_opt_(table_opt), - seperator_is_key_plus_seq_(false) {} + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -80,7 +89,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, - table_opt_.format_version); + table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, sub_index_builder_->index_block_builder_)); @@ -149,10 +158,18 @@ Status PartitionedIndexBuilder::Finish( Entry& last_entry = entries_.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), - handle_encoding); + handle_encoding, + &handle_delta_encoding_slice); } entries_.pop_front(); } @@ -202,9 +219,12 @@ size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize( uint64_t size = it->value->EstimatedSize(); BlockHandle tmp_block_handle(offset, size); tmp_block_handle.EncodeTo(&tmp_handle_encoding); + std::string handle_delta_encoding; + tmp_block_handle.EncodeSizeTo(&handle_delta_encoding); + const Slice handle_delta_encoding_slice(handle_delta_encoding); tmp_builder.Add( seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key), - tmp_handle_encoding); + tmp_handle_encoding, &handle_delta_encoding_slice); offset += size; } return tmp_builder.CurrentSizeEstimate(); diff --git a/table/index_builder.h b/table/index_builder.h index e8c060716..bc3a2bd54 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -38,6 +38,7 @@ class IndexBuilder { BlockBasedTableOptions::IndexType index_type, const rocksdb::InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); // Index builder will construct a set of blocks which contain: @@ -117,11 +118,16 @@ class IndexBuilder { class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval, - uint32_t format_version) + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) : IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval), - index_block_builder_without_seq_(index_block_restart_interval) { + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } @@ -145,10 +151,17 @@ class ShortenedIndexBuilder : public IndexBuilder { std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(sep, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), - handle_encoding); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); } } @@ -183,6 +196,7 @@ class ShortenedIndexBuilder : public IndexBuilder { BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -217,10 +231,10 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, int index_block_restart_interval, - int format_version) + int format_version, bool use_value_delta_encoding) : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, - format_version), + format_version, use_value_delta_encoding), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -323,10 +337,12 @@ class PartitionedIndexBuilder : public IndexBuilder { public: static PartitionedIndexBuilder* CreateIndexBuilder( const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); virtual ~PartitionedIndexBuilder(); @@ -361,6 +377,8 @@ class PartitionedIndexBuilder : public IndexBuilder { return seperator_is_key_plus_seq_; } + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + private: void MakeNewSubIndexBuilder(); @@ -380,10 +398,12 @@ class PartitionedIndexBuilder : public IndexBuilder { bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; + BlockHandle last_encoded_handle_; }; } // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 2fdb14c7f..a173d6069 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -10,15 +10,17 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" +#include "table/format.h" namespace rocksdb { class PinnedIteratorsManager; -class InternalIterator : public Cleanable { +template +class InternalIteratorBase : public Cleanable { public: - InternalIterator() {} - virtual ~InternalIterator() {} + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. @@ -66,7 +68,7 @@ class InternalIterator : public Cleanable { // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() - virtual Slice value() const = 0; + virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. // If non-blocking IO is requested and this operation cannot be @@ -117,14 +119,24 @@ class InternalIterator : public Cleanable { private: // No copying allowed - InternalIterator(const InternalIterator&) = delete; - InternalIterator& operator=(const InternalIterator&) = delete; + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; }; +using InternalIterator = InternalIteratorBase; + // Return an empty iterator (yields nothing). -extern InternalIterator* NewEmptyInternalIterator(); +template +extern InternalIteratorBase* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. -extern InternalIterator* NewErrorInternalIterator(const Status& status); +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); } // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc index 0411b374a..97c47fb28 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -131,7 +131,8 @@ class EmptyIterator : public Iterator { Status status_; }; -class EmptyInternalIterator : public InternalIterator { +template +class EmptyInternalIterator : public InternalIteratorBase { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } @@ -145,9 +146,9 @@ class EmptyInternalIterator : public InternalIterator { assert(false); return Slice(); } - Slice value() const override { + TValue value() const override { assert(false); - return Slice(); + return TValue(); } virtual Status status() const override { return status_; } @@ -164,30 +165,48 @@ Iterator* NewErrorIterator(const Status& status) { return new EmptyIterator(status); } -InternalIterator* NewEmptyInternalIterator() { - return new EmptyInternalIterator(Status::OK()); +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); } - -InternalIterator* NewEmptyInternalIterator(Arena* arena) { +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena) { if (arena == nullptr) { - return NewEmptyInternalIterator(); + return NewErrorInternalIterator(status); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(Status::OK()); + return new (mem) EmptyInternalIterator(status); } } - -InternalIterator* NewErrorInternalIterator(const Status& status) { - return new EmptyInternalIterator(status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template +InternalIteratorBase* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); } +template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); -InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { +template +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewErrorInternalIterator(status); + return NewEmptyInternalIterator(); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(status); + return new (mem) EmptyInternalIterator(Status::OK()); } } +template InternalIteratorBase* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index 5ddea2470..5941b846a 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -19,19 +19,21 @@ namespace rocksdb { // the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. -class IteratorWrapper { +template +class IteratorWrapperBase { public: - IteratorWrapper() : iter_(nullptr), valid_(false) {} - explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase* _iter) + : iter_(nullptr) { Set(_iter); } - ~IteratorWrapper() {} - InternalIterator* iter() const { return iter_; } + ~IteratorWrapperBase() {} + InternalIteratorBase* iter() const { return iter_; } // Set the underlying Iterator to _iter and return // previous underlying Iterator. - InternalIterator* Set(InternalIterator* _iter) { - InternalIterator* old_iter = iter_; + InternalIteratorBase* Set(InternalIteratorBase* _iter) { + InternalIteratorBase* old_iter = iter_; iter_ = _iter; if (iter_ == nullptr) { @@ -47,7 +49,7 @@ class IteratorWrapper { if (!is_arena_mode) { delete iter_; } else { - iter_->~InternalIterator(); + iter_->~InternalIteratorBase(); } } } @@ -55,7 +57,10 @@ class IteratorWrapper { // Iterator interface methods bool Valid() const { return valid_; } Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + TValue value() const { + assert(Valid()); + return iter_->value(); + } // Methods below require iter() != nullptr Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); iter_->Next(); Update(); } @@ -91,17 +96,16 @@ class IteratorWrapper { } } - InternalIterator* iter_; + InternalIteratorBase* iter_; bool valid_; Slice key_; }; +using IteratorWrapper = IteratorWrapperBase; + class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern InternalIterator* NewEmptyInternalIterator(Arena* arena); - -// Return an empty iterator with the specified status, allocated arena. -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); +template +extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index d0bd0f836..744de37da 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -387,7 +387,7 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, Arena* arena, bool prefix_seek_mode) { assert(n >= 0); if (n == 0) { - return NewEmptyInternalIterator(arena); + return NewEmptyInternalIterator(arena); } else if (n == 1) { return list[0]; } else { diff --git a/table/merging_iterator.h b/table/merging_iterator.h index 04fcf421d..21ff79bf6 100644 --- a/table/merging_iterator.h +++ b/table/merging_iterator.h @@ -15,9 +15,11 @@ namespace rocksdb { class Comparator; -class InternalIterator; class Env; class Arena; +template +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase; // Return an iterator that provided the union of the data in // children[0,n-1]. Takes ownership of the child iterators and diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index ba25c58c8..256730bfa 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -76,6 +76,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); @@ -218,6 +220,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, &new_table_properties->top_level_index_size}, {TablePropertiesNames::kIndexKeyIsUserKey, &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 4fceb1a63..a18c8edc4 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -27,7 +27,6 @@ class Footer; class Logger; class RandomAccessFile; struct TableProperties; -class InternalIterator; class MetaIndexBuilder { public: diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index ec0ac7bed..6084133b7 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -26,12 +26,17 @@ namespace rocksdb { PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval), - index_on_filter_block_builder_without_seq_(index_block_restart_interval), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), p_index_builder_(p_index_builder), filters_in_partition_(0), num_added_(0) { @@ -73,10 +78,15 @@ Slice PartitionedFilterBlockBuilder::Finish( FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + last_partition_block_handle.EncodeSizeTo(&handle_delta_encoding); + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding); + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); } filters.pop_front(); } else { @@ -109,12 +119,14 @@ PartitionedFilterBlockReader::PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool _whole_key_filtering, BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq) + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), prefix_extractor_(prefix_extractor), comparator_(comparator), table_(table), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { idx_on_fltr_blk_.reset(new Block(std::move(contents), kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, stats)); @@ -134,15 +146,10 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - auto input = biter.value(); - auto s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - continue; - } + handle = biter.value(); auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -168,7 +175,7 @@ bool PartitionedFilterBlockReader::KeyMayMatch( } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -207,7 +214,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -225,29 +232,26 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return res; } -Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { - return Slice(); + return BlockHandle(0, 0); } assert(iter.Valid()); - Slice handle_value = iter.value(); - return handle_value; + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; } BlockBasedTable::CachableEntry PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor) { - BlockHandle fltr_blk_handle; - auto s = fltr_blk_handle.DecodeFrom(handle_value); - assert(s.ok()); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -299,39 +303,25 @@ void PartitionedFilterBlockReader::CacheDependencies( // Before read partitions, prefetch them to avoid lots of IOs auto rep = table_->rep_; IndexBlockIter biter; - BlockHandle handle; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + BlockHandle handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s; s = prefetch_buffer->Prefetch(file.get(), prefetch_off, static_cast(prefetch_len)); @@ -339,14 +329,7 @@ void PartitionedFilterBlockReader::CacheDependencies( biter.SeekToFirst(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); - continue; - } - + handle = biter.value(); const bool no_io = true; const bool is_a_filter_partition = true; auto filter = table_->GetFilter( diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 86ec038a9..f6241749d 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -26,6 +26,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size); @@ -74,7 +75,8 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const SliceTransform* prefix_extractor, bool whole_key_filtering, BlockContents&& contents, FilterBitsReader* filter_bits_reader, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq); + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); virtual ~PartitionedFilterBlockReader(); virtual bool IsBlockBased() override { return false; } @@ -89,10 +91,11 @@ class PartitionedFilterBlockReader : public FilterBlockReader, virtual size_t ApproximateMemoryUsage() const override; private: - Slice GetFilterPartitionHandle(const Slice& entry); + BlockHandle GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor = nullptr); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -101,6 +104,7 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const InternalKeyComparator comparator_; const BlockBasedTable* table_; const bool index_key_includes_seq_; + const bool index_value_is_full_; std::unordered_map> filter_map_; diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index feb0c99c6..150eac6a8 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -100,7 +100,9 @@ class PartitionedFilterBlockTest : public testing::Test { } PartitionedIndexBuilder* NewIndexBuilder() { - return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_); + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -113,11 +115,12 @@ class PartitionedFilterBlockTest : public testing::Test { 99) / 100); partition_size = std::max(partition_size, static_cast(1)); + const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - table_options_.index_block_restart_interval, p_index_builder, - partition_size); + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); } std::unique_ptr table; @@ -143,7 +146,8 @@ class PartitionedFilterBlockTest : public testing::Test { !kSkipFilters, !kImmortal))); auto reader = new PartitionedFilterBlockReader( prefix_extractor, true, BlockContents(slice, false, kNoCompression), - nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); + nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); return reader; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 8dd774e4a..df08a98fa 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -38,7 +38,6 @@ class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; -class InternalIterator; using std::unique_ptr; using std::unordered_map; diff --git a/table/table_properties.cc b/table/table_properties.cc index 9c1c4bd8e..207a64191 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -94,8 +94,9 @@ std::string TableProperties::ToString( AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); char index_block_size_str[80]; snprintf(index_block_size_str, sizeof(index_block_size_str), - "index block size (user-key? %d)", - static_cast(index_key_is_user_key)); + "index block size (user-key? %d, delta-value? %d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); AppendProperty(result, index_block_size_str, index_size, prop_delim, kv_delim); if (index_partitions != 0) { @@ -163,6 +164,7 @@ void TableProperties::Add(const TableProperties& tp) { index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; @@ -181,6 +183,8 @@ const std::string TablePropertiesNames::kTopLevelIndexSize = "rocksdb.top-level.index.size"; const std::string TablePropertiesNames::kIndexKeyIsUserKey = "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h index b4e95750b..888b43d24 100644 --- a/table/table_properties_internal.h +++ b/table/table_properties_internal.h @@ -10,7 +10,6 @@ namespace rocksdb { -class InternalIterator; class BlockHandle; // Seek to the properties block. diff --git a/table/table_reader.h b/table/table_reader.h index b51b44b67..505b5ba1f 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -21,7 +21,6 @@ class Arena; struct ReadOptions; struct TableProperties; class GetContext; -class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from diff --git a/table/table_test.cc b/table/table_test.cc index 9ce98ab47..42220ad1e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2988,9 +2988,13 @@ TEST_F(HarnessTest, FooterTests) { class IndexBlockRestartIntervalTest : public TableTest, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface> { public: - static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } + static std::vector> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } }; INSTANTIATE_TEST_CASE_P( @@ -3002,12 +3006,16 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { const int kKeySize = 100; const int kValSize = 500; - int index_block_restart_interval = GetParam(); + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); Options options; BlockBasedTableOptions table_options; table_options.block_size = 64; // small block size to get big index block table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 09e0e1ef1..58ab61c69 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -19,12 +19,13 @@ namespace rocksdb { namespace { -class TwoLevelIterator : public InternalIterator { +class TwoLevelIndexIterator : public InternalIteratorBase { public: - explicit TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter); + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); - virtual ~TwoLevelIterator() { + virtual ~TwoLevelIndexIterator() { first_level_iter_.DeleteIter(false /* is_arena_mode */); second_level_iter_.DeleteIter(false /* is_arena_mode */); delete state_; @@ -42,7 +43,7 @@ class TwoLevelIterator : public InternalIterator { assert(Valid()); return second_level_iter_.key(); } - virtual Slice value() const override { + virtual BlockHandle value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -68,23 +69,24 @@ class TwoLevelIterator : public InternalIterator { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIterator* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapper first_level_iter_; - IteratorWrapper second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. - std::string data_block_handle_; + BlockHandle data_block_handle_; }; -TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} -void TwoLevelIterator::Seek(const Slice& target) { +void TwoLevelIndexIterator::Seek(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); @@ -94,7 +96,7 @@ void TwoLevelIterator::Seek(const Slice& target) { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekForPrev(const Slice& target) { +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -112,7 +114,7 @@ void TwoLevelIterator::SeekForPrev(const Slice& target) { } } -void TwoLevelIterator::SeekToFirst() { +void TwoLevelIndexIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -121,7 +123,7 @@ void TwoLevelIterator::SeekToFirst() { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekToLast() { +void TwoLevelIndexIterator::SeekToLast() { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -130,19 +132,19 @@ void TwoLevelIterator::SeekToLast() { SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::Next() { +void TwoLevelIndexIterator::Next() { assert(Valid()); second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::Prev() { +void TwoLevelIndexIterator::Prev() { assert(Valid()); second_level_iter_.Prev(); SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::SkipEmptyDataBlocksForward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -158,7 +160,7 @@ void TwoLevelIterator::SkipEmptyDataBlocksForward() { } } -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -174,24 +176,26 @@ void TwoLevelIterator::SkipEmptyDataBlocksBackward() { } } -void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { - InternalIterator* old_iter = second_level_iter_.Set(iter); +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } -void TwoLevelIterator::InitDataBlock() { +void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - Slice handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value(); if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && - handle.compare(data_block_handle_) == 0) { + handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIterator* iter = state_->NewSecondaryIterator(handle); - data_block_handle_.assign(handle.data(), handle.size()); + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; SetSecondLevelIterator(iter); } } @@ -199,8 +203,9 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) { - return new TwoLevelIterator(state, first_level_iter); +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) { + return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index d1c8f91af..55d5c01a4 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,7 +22,8 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; }; @@ -36,7 +37,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIterator* NewTwoLevelIterator( - TwoLevelIteratorState* state, InternalIterator* first_level_iter); +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/util/coding.h b/util/coding.h index 803e4349d..27a638347 100644 --- a/util/coding.h +++ b/util/coding.h @@ -64,12 +64,27 @@ extern Slice GetLengthPrefixedSlice(const char* data); extern Slice GetSliceUntil(Slice* slice, char delimiter); +// Borrowed from https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 +constexpr inline uint64_t i64ToZigzag(const int64_t l) { + return (static_cast(l) << 1) ^ static_cast(l >> 63); +} +inline int64_t zigzagToI64(uint64_t n) { + return (n >> 1) ^ -static_cast(n & 1); +} + // Pointer-based variants of GetVarint... These either store a value // in *v and return a pointer just past the parsed value, or return // nullptr on error. These routines only look at bytes in the range // [p..limit-1] extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); +inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, + int64_t* value) { + uint64_t u = 0; + const char* ret = GetVarint64Ptr(p, limit, &u); + *value = zigzagToI64(u); + return ret; +} // Returns the length of the varint32 or varint64 encoding of "v" extern int VarintLength(uint64_t v); @@ -249,11 +264,18 @@ inline char* EncodeVarint64(char* dst, uint64_t v) { } inline void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; + char buf[kMaxVarint64Length]; char* ptr = EncodeVarint64(buf, v); dst->append(buf, static_cast(ptr - buf)); } +inline void PutVarsignedint64(std::string* dst, int64_t v) { + char buf[kMaxVarint64Length]; + // Using Zigzag format to convert signed to unsigned + char* ptr = EncodeVarint64(buf, i64ToZigzag(v)); + dst->append(buf, static_cast(ptr - buf)); +} + inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) { char buf[20]; char* ptr = EncodeVarint64(buf, v1); diff --git a/util/testutil.cc b/util/testutil.cc index 7625d20ee..6094d7ba0 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -20,7 +20,7 @@ namespace rocksdb { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 3u; +const uint32_t kLatestFormatVersion = 4u; Slice RandomString(Random* rnd, int len, std::string* dst) { dst->resize(len); diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index ea8b1717e..d024cecb2 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -406,7 +406,7 @@ class TransactionTestBase : public ::testing::Test { if (empty_wal) { ASSERT_OK(s); } else { - // Test that we can detect the WAL that is produced by an incompatbile + // Test that we can detect the WAL that is produced by an incompatible // WritePolicy and fail fast before mis-interpreting the WAL. ASSERT_TRUE(s.IsNotSupported()); return; -- GitLab