block_based_table_reader.cc 134.3 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5
//
J
jorlow@chromium.org 已提交
6 7 8
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#include "table/block_based/block_based_table_reader.h"
10

11
#include <algorithm>
12
#include <array>
13
#include <limits>
14 15
#include <string>
#include <utility>
O
omegaga 已提交
16
#include <vector>
17

T
Tyler Harter 已提交
18
#include "db/dbformat.h"
19
#include "db/pinned_iterators_manager.h"
T
Tyler Harter 已提交
20

21
#include "rocksdb/cache.h"
22 23 24
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
25
#include "rocksdb/iterator.h"
26 27
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
S
Siying Dong 已提交
28
#include "rocksdb/table.h"
29
#include "rocksdb/table_properties.h"
30

31 32 33 34 35 36 37
#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
38
#include "table/block_fetcher.h"
J
jorlow@chromium.org 已提交
39
#include "table/format.h"
K
krad 已提交
40
#include "table/get_context.h"
S
sdong 已提交
41
#include "table/internal_iterator.h"
42
#include "table/meta_blocks.h"
43
#include "table/multiget_context.h"
K
krad 已提交
44
#include "table/persistent_cache_helper.h"
45
#include "table/sst_file_writer_collectors.h"
J
jorlow@chromium.org 已提交
46
#include "table/two_level_iterator.h"
47

48
#include "monitoring/perf_context_imp.h"
49
#include "test_util/sync_point.h"
J
jorlow@chromium.org 已提交
50
#include "util/coding.h"
51
#include "util/crc32c.h"
52
#include "util/file_reader_writer.h"
53
#include "util/stop_watch.h"
54
#include "util/string_util.h"
55
#include "util/xxhash.h"
J
jorlow@chromium.org 已提交
56

57
namespace rocksdb {
J
jorlow@chromium.org 已提交
58

I
xxHash  
Igor Canadi 已提交
59
// Magic number and meta-block names shared with the table builder; the
// definitions live elsewhere (hence extern).
extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

// Shorthand used throughout this file.
typedef BlockBasedTable::IndexReader IndexReader;

M
Maysam Yabandeh 已提交
65 66 67 68 69
// Closes the table (releasing cached/pinned resources) before destroying
// the reader's internal representation.
BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
}

70 71
// Monotonically increasing counter, starting at 0.
// NOTE(review): not referenced in this chunk; presumably used to build unique
// cache key prefixes when a file cannot supply its own id — confirm at usage.
std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

72 73 74 75 76
namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
  BlockContents contents;
  // BlockFetcher performs the actual I/O (optionally via prefetch_buffer)
  // and, when do_uncompress is set, the decompression.
  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
                             &contents, ioptions, do_uncompress,
                             maybe_compressed, uncompression_dict,
                             cache_options, memory_allocator);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    // Wrap the raw contents in a Block; the Block takes ownership of them.
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}

Y
Yi Wu 已提交
101
// Returns the allocator attached to the (uncompressed) block cache, or
// nullptr when no block cache is configured.
inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  if (table_options.block_cache) {
    return table_options.block_cache->memory_allocator();
  }
  return nullptr;
}

108 109 110 111 112 113 114
// Returns the allocator attached to the compressed block cache, or nullptr
// when no compressed block cache is configured.
inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  if (table_options.block_cache_compressed) {
    return table_options.block_cache_compressed->memory_allocator();
  }
  return nullptr;
}

115 116
// Delete the entry resided in the cache.
template <class Entry>
A
Andrew Kryczka 已提交
117
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
118 119 120 121
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

122
// Deleters for specific cached entry types; defined elsewhere in this
// translation unit (outside this chunk).
void DeleteCachedFilterEntry(const Slice& key, void* value);
void DeleteCachedUncompressionDictEntry(const Slice& key, void* value);
124

125 126 127 128
// Release the cached entry and decrement its ref count.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
129
  cache->Release(handle, true /* force_erase */);
130 131
}

132 133 134
// For hash based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
135 136
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
137 138 139
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index in prefix_extractor is not set; if  prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index
140 141
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
142 143
    return true;
  }
144

145
  // prefix_extractor and prefix_extractor_block are both non-empty
146 147
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
148 149 150 151 152 153
    return true;
  } else {
    return false;
  }
}

154 155
}  // namespace

156 157 158 159 160
// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  // Reads the top-level index block of `table` (ignoring any locally pinned
  // copy); on success populates *index_block.
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options,
                               GetContext* get_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  // True when index keys are full internal keys (not just user keys).
  // Files without table properties default to true for compatibility.
  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_key_is_user_key;
  }

  // True when index values are stored in full (not delta encoded).
  // Files without table properties default to true for compatibility.
  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_value_is_delta_encoded;
  }

  // Hands out the locally pinned index block when one is held, otherwise
  // falls back to ReadIndexBlock (cache and/or file).
  Status GetOrReadIndexBlock(const ReadOptions& read_options,
                             GetContext* get_context,
                             CachableEntry<Block>* index_block) const;

  // Charges the index block's memory only when this reader owns it; a
  // cache-resident block is accounted for by the cache itself.
  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, GetContext* get_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  // The index block location comes from the footer; the index block is not
  // dictionary-compressed, hence the empty uncompression dict.
  constexpr bool is_index = true;
  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, is_index, get_context);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    const ReadOptions& read_options, GetContext* get_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (index_block_.IsEmpty()) {
    // No locally pinned copy; go through the cache / file path.
    return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, read_options,
                          get_context, index_block);
  }

  // Hand out an unowned reference to the locally held index block.
  *index_block =
      CachableEntry<Block>(index_block_.GetValue(), nullptr /* cache */,
                           nullptr /* cache_handle */, false /* own_value */);
  return Status::OK();
}

M
Maysam Yabandeh 已提交
257
// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      // Load (and possibly cache) the top-level index block now rather than
      // on first use.
      const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                                      nullptr /* get_context */, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        // The block now lives in the block cache; drop our own reference.
        index_block.Reset();
      }
    }

    *index_reader = new PartitionIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  // return a two-level iterator: first level is on the partition index
  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context) override {
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(read_options, get_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    InternalIteratorBase<BlockHandle>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // Partitions were pinned by CacheDependencies(); serve them straight
      // from partition_map_ through the two-level iterator state.
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(
              table(), &partition_map_, index_key_includes_seq(),
              index_value_is_full()),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      constexpr bool is_index = true;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, is_index,
          index_key_includes_seq(), index_value_is_full());
    }

    assert(it != nullptr);
    // Keep the top-level index block alive for as long as the iterator is.
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before read partitions, prefetch them to avoid lots of IOs
    auto rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */,
                                   &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    auto& file = rep->file;
    prefetch_buffer.reset(new FilePrefetchBuffer());
    // NOTE(review): a failed prefetch is not treated as fatal here; the
    // per-partition reads below can still hit the file directly — confirm.
    s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value();
      CachableEntry<Block> block;
      const bool is_index = true;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, is_index, nullptr /* get_context */);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            // Hold the cache handle so the partition stays resident.
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  // Maps partition block offset -> pinned cache entry for that partition.
  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

439 440 441
// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                                      nullptr /* get_context */, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        // The block is resident in the block cache; drop our reference.
        index_block.Reset();
      }
    }

    *index_reader = new BinarySearchIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context) override {
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(read_options, get_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());

    assert(it != nullptr);
    // Tie the index block's lifetime to the returned iterator.
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    auto rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                                      nullptr /* get_context */, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        // The block is resident in the block cache; drop our reference.
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader = new HashIndexReader(table, std::move(index_block));
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
        cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // NOTE(review): unlike the other failure paths in this function, this
      // one propagates the error even though *index_reader has already been
      // populated above — confirm callers handle that consistently.
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
        cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context) override {
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(read_options, get_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    // Fall back to total-order seek when prefix seek is disabled or the
    // caller explicitly requested it.
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_key_includes_seq(),
        index_value_is_full(), false /* block_contents_pinned */,
        prefix_index_.get());

    assert(it != nullptr);
    // Tie the index block's lifetime to the returned iterator.
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  // Optional prefix hash index; null when construction failed or was skipped.
  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707
// Looks up `key` in `block_cache`, recording hit/miss statistics. When a
// GetContext is supplied, per-query stats are updated instead of the global
// tickers. Returns the cache handle, or nullptr on a miss.
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker,
    Tickers block_cache_hit_ticker, uint64_t* block_cache_miss_stats,
    uint64_t* block_cache_hit_stats, Statistics* statistics,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, statistics);
  if (cache_handle != nullptr) {
    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                              static_cast<uint32_t>(rep_->level));
    if (get_context != nullptr) {
      // overall cache hit
      get_context->get_context_stats_.num_cache_hit++;
      // total bytes read from cache
      get_context->get_context_stats_.num_cache_bytes_read +=
          block_cache->GetUsage(cache_handle);
      // block-type specific cache hit
      (*block_cache_hit_stats)++;
    } else {
      // overall cache hit
      RecordTick(statistics, BLOCK_CACHE_HIT);
      // total bytes read from cache
      RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
                 block_cache->GetUsage(cache_handle));
      RecordTick(statistics, block_cache_hit_ticker);
    }
  } else {
    PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                              static_cast<uint32_t>(rep_->level));
    if (get_context != nullptr) {
      // overall cache miss
      get_context->get_context_stats_.num_cache_miss++;
      // block-type specific cache miss
      (*block_cache_miss_stats)++;
    } else {
      RecordTick(statistics, BLOCK_CACHE_MISS);
      RecordTick(statistics, block_cache_miss_ticker);
    }
  }

  return cache_handle;
}

708
// Helper function to setup the cache key's prefix for the Table.
// One prefix each for the block cache, the persistent cache and the
// compressed block cache, derived from the underlying file.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    // No cache to fall back on for an id; only the file's unique id is used.
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

729 730
// Builds a cache key prefix in `buffer` for blocks of `file`: prefer the
// file's own unique id; otherwise fall back to a fresh id from the cache
// (when a cache is available).
void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // Ask the file for a unique id first.
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // No id generated (or it was too long): encode a cache-issued id instead.
  if (cc != nullptr && *size == 0) {
    char* const end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

742 743
// Builds a cache key prefix in `buffer` for blocks of `file` (writer side).
// Mirrors the RandomAccessFile overload above.
void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache. Check `cc` for null first, consistent with
  // the RandomAccessFile overload (which guards before calling NewId()).
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

755 756 757 758 759 760 761 762 763 764 765 766
namespace {
// Return True if table_properties has `user_prop_name` has a `true` value
// or it doesn't contain this property (for backward compatible).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older version doesn't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
767 768
      ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
                     user_prop_name.c_str(), pos->second.c_str());
769 770 771 772
    }
  }
  return true;
}
773

774 775 776 777 778 779 780
// Caller has to ensure seqno is not nullptr.
// Derive the table's global sequence number from its user-collected
// properties. Non-external files and v1 external files must not carry a
// global seqno; v2+ external files may, and it must agree with
// `largest_seqno` when the latter is known.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file have global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      // NOTE: guard the seqno_pos dereference -- this branch is also taken
      // when version != 1 and no global seqno property is present, in which
      // case the original code dereferenced an end() iterator (UB).
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u have global seqno "
               "property with value %s",
               version,
               seqno_pos != props.end() ? seqno_pos->second.c_str() : "");
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno
  // to denote it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u have global seqno property "
          "with value %s, while largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u have global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
847 848
}  // namespace

K
krad 已提交
849 850 851 852 853 854 855 856 857 858 859 860
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  // A block's cache key is the table's prefix followed by the varint-encoded
  // offset of the block within the file.
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* const key_end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(key_end - cache_key));
}

L
Lei Jin 已提交
861 862
// Open an SST file and build a BlockBasedTable reader over it. On success
// `*table_reader` owns the new reader; on failure it is left reset. Reads,
// in order: footer, metaindex, properties, range-deletion block, then
// (optionally) prefetches index and filter blocks.
Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                             const EnvOptions& env_options,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
                             std::unique_ptr<RandomAccessFileReader>&& file,
                             uint64_t file_size,
                             std::unique_ptr<TableReader>* table_reader,
                             const SliceTransform* prefix_extractor,
                             const bool prefetch_index_and_filter_in_cache,
                             const bool skip_filters, const int level,
                             const bool immortal_table,
                             const SequenceNumber largest_seqno,
                             TailPrefetchStats* tail_prefetch_stats) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  // Best-effort tail prefetch; a failure here is tolerated since the footer
  // read below will surface any real I/O error.
  s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                   preload_all, &prefetch_buffer);

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not mutate rep_ after the creation. eg. internal_prefix_transform
  // raw pointer will be used to create HashIndexReader, whose reset may
  // access a dangling pointer.
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->index_type = table_options.index_type;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(prefix_extractor));
  SetupCacheKeyPrefix(rep);
  // `new_table` owns `rep` from here on; early returns below will free it.
  std::unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter);
  if (!s.ok()) {
    return s;
  }

  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(),
                                     largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(),
                                   internal_comparator);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all,
      table_options, level);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  // Decide how many bytes at the end of the file to read ahead of the footer.
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before read footer, readahead backwards to prefetch data. Do more
    // readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = (prefetch_all || preload_all) ? 512 * 1024 : 4 * 1024;
  }
  // Clamp the prefetch window to the file itself.
  size_t prefetch_off = 0;
  size_t prefetch_len = static_cast<size_t>(file_size);
  if (file_size >= tail_prefetch_size) {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  // TODO should not have this special logic in the future.
  const bool direct_io = file->use_direct_io();
  prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, direct_io,
                                                true /* track_min_offset */));
  if (direct_io) {
    return (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return file->Prefetch(prefetch_off, prefetch_len);
}

1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  // Recompute the checksum of buf[0, len) with the given algorithm and
  // compare it to `expected` (crc32c values are stored masked).
  uint32_t actual = 0;
  switch (type) {
    case kNoChecksum:
      break;
    case kCRC32c:
      expected = crc32c::Unmask(expected);
      actual = crc32c::Value(buf, len);
      break;
    case kxxHash:
      actual = XXH32(buf, static_cast<int>(len), 0);
      break;
    case kxxHash64:
      actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                     uint64_t{0xffffffff});
      break;
    default:
      return Status::Corruption("unknown checksum type");
  }
  if (actual != expected) {
    return Status::Corruption("properties block checksum mismatched");
  }
  return Status::OK();
}

1043
// Re-read the properties block when the normal checksum-verified read failed.
// Zeroes out the in-memory copy of the global seqno (which ingestion may have
// rewritten in place) and verifies the checksum against that restored image.
Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      // Restore the global seqno field to 0 before re-checksumming.
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    // The stored checksum sits right after the block data + 1-byte type.
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
                                block_size + 1, value);
  }
  return s;
}

1078
// Locate and read the properties meta-block, populate rep_->table_properties
// and derived flags (compression hints, prefix extractor, filtering features,
// global seqno). Missing properties are logged but only a hard read error or
// an invalid global seqno makes the returned status non-OK.
Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
  bool found_properties_block = true;
  Status s;
  s = SeekToPropertiesBlock(meta_iter, &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(
          meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, true /* verify_checksum */,
          nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
          false /* compression_type_missing */, nullptr /* memory_allocator */);
    }

    if (s.IsCorruption()) {
      // Possibly an ingested file whose global seqno was rewritten in place;
      // retry with the seqno zeroed out before checksum verification.
      s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
                                           &table_properties);
    }
    std::unique_ptr<TableProperties> props_guard;
    if (table_properties != nullptr) {
      props_guard.reset(table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties.reset(props_guard.release());
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }
#ifndef ROCKSDB_LITE
  if (rep_->table_properties) {
    ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
                        &(rep_->table_prefix_extractor));
  }
#endif  // ROCKSDB_LITE

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.info_log);
    rep_->prefix_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kPrefixFiltering,
                           rep_->ioptions.info_log);

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
    }
  }
  return s;
}
1156

1157
// Locate the range-deletion tombstone meta-block (if any) and build the
// table's FragmentedRangeTombstoneList from it. Seek/read problems are
// logged; only an iterator error after a successful seek is returned.
Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator) {
  Status s;
  bool found_range_del_block;
  BlockHandle range_del_handle;
  s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (found_range_del_block && !range_del_handle.IsNull()) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle, nullptr /* input_iter */,
        false /* is_index */, true /* key_includes_seq */,
        true /* index_key_is_full */, nullptr /* get_context */, Status(),
        prefetch_buffer));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}

// Read the compression dictionary meta-block (if the table has one) into
// `*compression_dict_block`. A missing handle is not an error; read failures
// are logged and returned.
Status BlockBasedTable::ReadCompressionDictBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<const BlockContents>* compression_dict_block) const {
  assert(compression_dict_block != nullptr);
  Status s;
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()};
    PersistentCacheOptions cache_options;
    ReadOptions read_options;
    read_options.verify_checksums = true;
    // The dictionary itself is stored uncompressed, so no decompression here.
    BlockFetcher compression_block_fetcher(
        rep_->file.get(), prefetch_buffer, rep_->footer, read_options,
        rep_->compression_dict_handle, compression_dict_cont.get(),
        rep_->ioptions, false /* decompress */, false /*maybe_compressed*/,
        UncompressionDict::GetEmptyDict(), cache_options);
    s = compression_block_fetcher.ReadBlockContents();

    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from compression dictionary "
          "block %s",
          s.ToString().c_str());
    } else {
      *compression_dict_block = std::move(compression_dict_cont);
    }
  }
  return s;
}

// Discover the filter and compression-dictionary meta-blocks, create the
// index reader, and (depending on table options and `level`) prefetch and/or
// pin the index and filter blocks in the block cache.
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level) {
  Status s;

  // Find filter handle and filter type
  if (rep_->filter_policy) {
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep_->filter_policy->Name());
      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
              .ok()) {
        rep_->filter_type = filter_type;
        break;
      }
    }
  }

  {
    // Find compression dictionary handle
    bool found_compression_dict;
    s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
                                   &rep_->compression_dict_handle);
  }

  BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  // prefetch the first level of index
  const bool prefetch_index =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // prefetch the first level of filter
  const bool prefetch_filter =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partition filters cannot be enabled without partition indexes
  assert(!prefetch_filter || prefetch_index);
  // pin both index and filters, down to all partitions
  const bool pin_all =
      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // pin the first level of index
  const bool pin_index =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // pin the first level of filter
  const bool pin_filter =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);

  IndexReader* index_reader = nullptr;
  if (s.ok()) {
    s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
                                     prefetch_index, pin_index, &index_reader);
    if (s.ok()) {
      assert(index_reader != nullptr);
      rep_->index_reader.reset(index_reader);
      // The partitions of partitioned index are always stored in cache. They
      // are hence follow the configuration for pin and prefetch regardless of
      // the value of cache_index_and_filter_blocks
      if (prefetch_all) {
        rep_->index_reader->CacheDependencies(pin_all);
      }
    } else {
      delete index_reader;
      index_reader = nullptr;
    }
  }

  // pre-fetching of blocks is turned on
  // Will use block cache for meta-blocks access
  // Always prefetch index and filter for level 0
  // TODO(ajkr): also prefetch compression dictionary block
  // TODO(ajkr): also pin compression dictionary block when
  // `pin_l0_filter_and_index_blocks_in_cache == true`.
  if (table_options.cache_index_and_filter_blocks) {
    assert(table_options.block_cache != nullptr);
    if (s.ok() && prefetch_filter) {
      // Hack: Call GetFilter() to implicitly add filter to the block_cache
      auto filter_entry =
          new_table->GetFilter(rep_->table_prefix_extractor.get());
      if (filter_entry.GetValue() != nullptr && prefetch_all) {
        filter_entry.GetValue()->CacheDependencies(
            pin_all, rep_->table_prefix_extractor.get());
      }
      // if pin_filter is true then save it in rep_->filter_entry; it will be
      // released in the destructor only, hence it will be pinned in the
      // cache while this reader is alive
      if (pin_filter) {
        rep_->filter_entry = std::move(filter_entry);
      }
    }
  } else {
    std::unique_ptr<const BlockContents> compression_dict_block;
    if (s.ok()) {
      // Set filter block
      if (rep_->filter_policy) {
        const bool is_a_filter_partition = true;
        auto filter = new_table->ReadFilter(
            prefetch_buffer, rep_->filter_handle, !is_a_filter_partition,
            rep_->table_prefix_extractor.get());
        rep_->filter.reset(filter);
        // Refer to the comment above about partitioned indexes always being
        // cached
        if (filter && prefetch_all) {
          filter->CacheDependencies(pin_all,
                                    rep_->table_prefix_extractor.get());
        }
      }
      s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    }
    if (s.ok() && !rep_->compression_dict_handle.IsNull()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      rep_->uncompression_dict.reset(new UncompressionDict(
          compression_dict_block->data.ToString(),
          rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics));
    }
  }
  return s;
}

S
Siying Dong 已提交
1364
void BlockBasedTable::SetupForCompaction() {
1365
  switch (rep_->ioptions.access_hint_on_compaction_start) {
1366 1367 1368
    case Options::NONE:
      break;
    case Options::NORMAL:
1369
      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
1370 1371
      break;
    case Options::SEQUENTIAL:
1372
      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
1373 1374
      break;
    case Options::WILLNEED:
1375
      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
1376 1377 1378 1379 1380 1381
      break;
    default:
      assert(false);
  }
}

K
kailiu 已提交
1382 1383
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
K
kailiu 已提交
1384
  return rep_->table_properties;
K
Kai Liu 已提交
1385
}
S
Sanjay Ghemawat 已提交
1386

1387 1388 1389 1390 1391 1392 1393 1394
// Sum the approximate heap usage of the reader's long-lived components:
// filter, index reader, and uncompression dictionary.
size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict) {
    usage += rep_->uncompression_dict->ApproximateMemoryUsage();
  }
  return usage;
}

K
Kai Liu 已提交
1401 1402
// Load the meta-block from the file. On success, return the loaded meta block
// and its iterator.
1403
Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer,
S
sdong 已提交
1404 1405
                                      std::unique_ptr<Block>* meta_block,
                                      std::unique_ptr<InternalIterator>* iter) {
S
Sanjay Ghemawat 已提交
1406 1407
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
1408
  std::unique_ptr<Block> meta;
K
Kai Liu 已提交
1409
  Status s = ReadBlockFromFile(
1410 1411
      rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
      rep_->footer.metaindex_handle(), &meta, rep_->ioptions,
1412
      true /* decompress */, true /*maybe_compressed*/,
1413
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1414
      kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
1415
      GetMemoryAllocator(rep_->table_options));
K
Kai Liu 已提交
1416

K
Kai Liu 已提交
1417
  if (!s.ok()) {
1418
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
1419 1420 1421
                    "Encountered error while reading data from properties"
                    " block %s",
                    s.ToString().c_str());
K
Kai Liu 已提交
1422
    return s;
S
Sanjay Ghemawat 已提交
1423
  }
K
Kai Liu 已提交
1424

1425
  *meta_block = std::move(meta);
K
Kai Liu 已提交
1426
  // meta block uses bytewise comparator.
1427 1428
  iter->reset(meta_block->get()->NewIterator<DataBlockIter>(
      BytewiseComparator(), BytewiseComparator()));
K
Kai Liu 已提交
1429
  return Status::OK();
S
Sanjay Ghemawat 已提交
1430 1431
}

1432 1433
// Look up a block, first in the uncompressed block cache, then in the
// compressed block cache. On a compressed-cache hit, decompress the block,
// optionally insert the result into the uncompressed cache, and hand it to
// `block`. `block` is left empty on a miss; the status reflects only real
// errors (e.g. decompression or cache-insert failure).
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<Block>* block,
    const UncompressionDict& uncompression_dict, bool is_index,
    GetContext* get_context) const {
  // Read-amp tracking applies to data blocks only.
  const size_t read_amp_bytes_per_bit =
      !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0;
  assert(block);
  assert(block->IsEmpty());

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;
  Statistics* statistics = rep_->ioptions.statistics;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    auto cache_handle = GetEntryFromCache(
        block_cache, block_cache_key,
        is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS,
        is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT,
        get_context
            ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss
                        : &get_context->get_context_stats_.num_cache_data_miss)
            : nullptr,
        get_context
            ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit
                        : &get_context->get_context_stats_.num_cache_data_hit)
            : nullptr,
        statistics, get_context);
    if (cache_handle != nullptr) {
      if (is_index) {
        PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
      }

      block->SetCachedValue(
          reinterpret_cast<Block*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);
  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type = compressed_block->get_compression_type();
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressBlockContents(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert uncompressed block into block cache
  if (s.ok()) {
    std::unique_ptr<Block> block_holder(
        new Block(std::move(contents), rep_->get_global_seqno(is_index),
                  read_amp_bytes_per_bit, statistics));  // uncompressed block

    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                              &DeleteCachedEntry<Block>,
                              &cache_handle);
#ifndef NDEBUG
      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
      if (s.ok()) {
        assert(cache_handle != nullptr);
        block->SetCachedValue(block_holder.release(), block_cache,
                              cache_handle);

        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_add++;
          get_context->get_context_stats_.num_cache_bytes_write += charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
        }
        if (is_index) {
          if (get_context != nullptr) {
            get_context->get_context_stats_.num_cache_index_add++;
            get_context->get_context_stats_.num_cache_index_bytes_insert +=
                charge;
          } else {
            RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
            RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
          }
        } else {
          if (get_context != nullptr) {
            get_context->get_context_stats_.num_cache_data_add++;
            get_context->get_context_stats_.num_cache_data_bytes_insert +=
                charge;
          } else {
            RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
            RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
          }
        }
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      // Not inserted into the cache: the caller's entry owns the block.
      block->SetOwnedValue(block_holder.release());
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

// Inserts a freshly-read block into the configured caches.
//
// If `raw_block_comp_type` indicates a compressed block, the raw (compressed)
// contents are inserted into `block_cache_compressed` (when present) and an
// uncompressed copy is produced for the uncompressed `block_cache`. On
// success, `cached_block` either references the cache-resident block (cache
// handle held) or owns the block outright when no uncompressed cache is used.
//
// @param block_cache_key             key for the uncompressed block cache
// @param compressed_block_cache_key  key for the compressed block cache
// @param raw_block_contents          contents as read from the file; may be
//                                    moved-from by this function
// @param seq_no                      global sequence number applied to the
//                                    new Block
// @param is_index  true when caching an index block; affects read-amp
//                  tracking, cache priority, and which tickers are bumped
// @return status of decompression / cache insertion
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    CachableEntry<Block>* cached_block, BlockContents* raw_block_contents,
    CompressionType raw_block_comp_type,
    const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
    MemoryAllocator* memory_allocator, bool is_index,
    GetContext* get_context) const {
  const ImmutableCFOptions& ioptions = rep_->ioptions;
  const uint32_t format_version = rep_->table_options.format_version;
  // Read-amp bitmap is only maintained for data blocks.
  const size_t read_amp_bytes_per_bit =
      !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0;
  const Cache::Priority priority =
      is_index && rep_->table_options
                      .cache_index_and_filter_blocks_with_high_priority
          ? Cache::Priority::HIGH
          : Cache::Priority::LOW;
  assert(cached_block);
  assert(cached_block->IsEmpty());
  // A compressed block implies the caller configured a compressed cache.
  assert(raw_block_comp_type == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  Statistics* statistics = ioptions.statistics;

  std::unique_ptr<Block> block_holder;
  if (raw_block_comp_type != kNoCompression) {
    // Retrieve the uncompressed contents into a new buffer
    BlockContents uncompressed_block_contents;
    UncompressionContext context(raw_block_comp_type);
    UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
    s = UncompressBlockContents(info, raw_block_contents->data.data(),
                                raw_block_contents->data.size(),
                                &uncompressed_block_contents, format_version,
                                ioptions, memory_allocator);
    if (!s.ok()) {
      return s;
    }

    block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  } else {
    // Already uncompressed: take ownership of the raw contents directly.
    block_holder.reset(new Block(std::move(*raw_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr &&
      raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
      raw_block_contents->own_bytes()) {
#ifndef NDEBUG
    assert(raw_block_contents->is_raw_block);
#endif  // NDEBUG

    // We cannot directly put raw_block_contents because this could point to
    // an object in the stack.
    BlockContents* block_cont_for_comp_cache =
        new BlockContents(std::move(*raw_block_contents));
    s = block_cache_compressed->Insert(
        compressed_block_cache_key, block_cont_for_comp_cache,
        block_cont_for_comp_cache->ApproximateMemoryUsage(),
        &DeleteCachedEntry<BlockContents>);
    if (s.ok()) {
      // Avoid the following code to delete this cached block.
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
      // Insertion failed, so the cache did not take ownership.
      delete block_cont_for_comp_cache;
    }
  }

  // insert into uncompressed block cache
  if (block_cache != nullptr && block_holder->own_bytes()) {
    size_t charge = block_holder->ApproximateMemoryUsage();
    Cache::Handle* cache_handle = nullptr;
    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                            &DeleteCachedEntry<Block>,
                            &cache_handle, priority);
#ifndef NDEBUG
    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
    if (s.ok()) {
      assert(cache_handle != nullptr);
      // Cache now owns the block; cached_block holds the cache handle.
      cached_block->SetCachedValue(block_holder.release(), block_cache,
                                   cache_handle);

      // Stats go to the per-operation GetContext when available, otherwise
      // directly to the global Statistics tickers.
      if (get_context != nullptr) {
        get_context->get_context_stats_.num_cache_add++;
        get_context->get_context_stats_.num_cache_bytes_write += charge;
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD);
        RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
      }
      if (is_index) {
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_index_add++;
          get_context->get_context_stats_.num_cache_index_bytes_insert +=
              charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
        }
      } else {
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_data_add++;
          get_context->get_context_stats_.num_cache_data_bytes_insert += charge;
        } else {
          RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge);
        }
      }
      assert(reinterpret_cast<Block*>(block_cache->Value(
                 cached_block->GetCacheHandle())) == cached_block->GetValue());
    } else {
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
    }
  } else {
    // No uncompressed cache (or block doesn't own its bytes): the entry
    // takes direct ownership of the block.
    cached_block->SetOwnedValue(block_holder.release());
  }

  return s;
}

// Reads the filter block at `filter_handle` from the file and constructs the
// matching FilterBlockReader subclass based on rep_->filter_type. Returns
// nullptr when the table has no filter or the read fails. The caller owns the
// returned reader.
//
// When reading a partition of a partitioned filter (is_a_filter_partition),
// the partition itself is parsed as a full filter.
FilterBlockReader* BlockBasedTable::ReadFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
    const bool is_a_filter_partition,
    const SliceTransform* prefix_extractor) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;

  // Filter blocks are stored uncompressed, so no decompression is requested.
  BlockFetcher block_fetcher(
      rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
      filter_handle, &block, rep->ioptions, false /* decompress */,
      false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
      rep->persistent_cache_options, GetMemoryAllocator(rep->table_options));
  Status s = block_fetcher.ReadBlockContents();

  if (!s.ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    // An individual partition of a partitioned filter is a full filter.
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_key_is_user_key == 0,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_value_is_delta_encoded == 0);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}

1760
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
1761 1762
    const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer,
    bool no_io, GetContext* get_context) const {
M
Maysam Yabandeh 已提交
1763 1764
  const BlockHandle& filter_blk_handle = rep_->filter_handle;
  const bool is_a_filter_partition = true;
1765
  return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition,
1766
                   no_io, get_context, prefix_extractor);
M
Maysam Yabandeh 已提交
1767 1768
}

// Returns the table's filter reader, preferring (in order): the pre-loaded
// rep_->filter when filters are not block-cache managed, the already-cached
// rep_->filter_entry, the block cache, and finally a read from the file
// (which also populates the block cache). Returns an empty entry when no
// filter is applicable or when `no_io` forbids the required read.
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
    const bool is_a_filter_partition, bool no_io, GetContext* get_context,
    const SliceTransform* prefix_extractor) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), nullptr /* cache */,
      nullptr /* cache_handle */, false /* own_value */};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return CachableEntry<FilterBlockReader>();
  }

  // Fast path: the whole filter is already cached on the table object.
  if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
    return {rep_->filter_entry.GetValue(), nullptr /* cache */,
      nullptr /* cache_handle */, false /* own_value */};
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Statistics* statistics = rep_->ioptions.statistics;
  Cache::Handle* cache_handle = GetEntryFromCache(
      block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT,
      get_context ? &get_context->get_context_stats_.num_cache_filter_miss
                  : nullptr,
      get_context ? &get_context->get_context_stats_.num_cache_filter_hit
                  : nullptr,
      statistics, get_context);

  FilterBlockReader* filter = nullptr;
  if (cache_handle != nullptr) {
    PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
    filter =
        reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
  } else if (no_io) {
    // Do not invoke any io.
    return CachableEntry<FilterBlockReader>();
  } else {
    // Cache miss with I/O allowed: read the filter and try to cache it.
    filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                        is_a_filter_partition, prefix_extractor);
    if (filter != nullptr) {
      size_t usage = filter->ApproximateMemoryUsage();
      Status s = block_cache->Insert(
          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        PERF_COUNTER_ADD(filter_block_read_count, 1);
        if (get_context != nullptr) {
          get_context->get_context_stats_.num_cache_add++;
          get_context->get_context_stats_.num_cache_bytes_write += usage;
          get_context->get_context_stats_.num_cache_filter_add++;
          get_context->get_context_stats_.num_cache_filter_bytes_insert +=
              usage;
        } else {
          RecordTick(statistics, BLOCK_CACHE_ADD);
          RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
          RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
          RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
        }
      } else {
        // Insertion failed: the cache never took ownership, so free the
        // reader here and report no filter.
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
        delete filter;
        return CachableEntry<FilterBlockReader>();
      }
    }
  }

  // A non-null cache_handle means the returned entry is cache-resident.
  return {filter, cache_handle ? block_cache : nullptr, cache_handle,
    false /* own_value */};
}

// Returns the table's uncompression (decompression) dictionary. When
// meta-blocks are not managed by the block cache, the table-owned
// rep_->uncompression_dict is returned directly; otherwise the dictionary is
// looked up in (and, on miss with I/O allowed, loaded into) the block cache.
// Returns an empty entry when the table has no compression dictionary, when
// `no_io` forbids the read, or on read/insert failure.
CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
    FilePrefetchBuffer* prefetch_buffer, bool no_io,
    GetContext* get_context) const {
  if (!rep_->table_options.cache_index_and_filter_blocks) {
    // block cache is either disabled or not used for meta-blocks. In either
    // case, BlockBasedTableReader is the owner of the uncompression dictionary.
    return {rep_->uncompression_dict.get(), nullptr /* cache */,
            nullptr /* cache_handle */, false /* own_value */};
  }
  if (rep_->compression_dict_handle.IsNull()) {
    // No compression dictionary stored in this table.
    return CachableEntry<UncompressionDict>();
  }
  char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                  rep_->compression_dict_handle, cache_key_buf);
  auto cache_handle = GetEntryFromCache(
      rep_->table_options.block_cache.get(), cache_key,
      BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT,
      get_context
          ? &get_context->get_context_stats_.num_cache_compression_dict_miss
          : nullptr,
      get_context
          ? &get_context->get_context_stats_.num_cache_compression_dict_hit
          : nullptr,
      rep_->ioptions.statistics, get_context);
  UncompressionDict* dict = nullptr;
  if (cache_handle != nullptr) {
    dict = reinterpret_cast<UncompressionDict*>(
        rep_->table_options.block_cache->Value(cache_handle));
  } else if (no_io) {
    // Do not invoke any io.
  } else {
    // Cache miss with I/O allowed: read the dict block and cache the dict.
    std::unique_ptr<const BlockContents> compression_dict_block;
    Status s =
        ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    size_t usage = 0;
    if (s.ok()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      dict = new UncompressionDict(compression_dict_block->data.ToString(),
                                   rep_->blocks_definitely_zstd_compressed,
                                   rep_->ioptions.statistics);
      usage = dict->ApproximateMemoryUsage();
      s = rep_->table_options.block_cache->Insert(
          cache_key, dict, usage, &DeleteCachedUncompressionDictEntry,
          &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
    }
    if (s.ok()) {
      PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
      if (get_context != nullptr) {
        get_context->get_context_stats_.num_cache_add++;
        get_context->get_context_stats_.num_cache_bytes_write += usage;
        get_context->get_context_stats_.num_cache_compression_dict_add++;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD);
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage);
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(rep_->ioptions.statistics,
                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage);
      }
    } else {
      // There should be no way to get here if block cache insertion succeeded.
      // Though it is still possible something failed earlier.
      RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
      delete dict;
      dict = nullptr;
      assert(cache_handle == nullptr);
    }
  }
  return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
          cache_handle, false /* own_value */};
}

1933 1934
// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
1935
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
1936
    const ReadOptions& read_options, bool disable_prefix_seek,
1937
    IndexBlockIter* input_iter, GetContext* get_context) const {
1938 1939
  assert(rep_ != nullptr);
  assert(rep_->index_reader != nullptr);
1940

1941
  // We don't return pinned data from index blocks, so no need
1942
  // to set `block_contents_pinned`.
1943 1944
  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
                                         input_iter, get_context);
K
Kai Liu 已提交
1945 1946
}

// Converts an index iterator value (i.e., an encoded BlockHandle) into an
// iterator over the contents of the corresponding block.
// If input_iter is null, a new iterator is heap-allocated and returned;
// otherwise input_iter is updated in place and returned.
// The incoming Status `s` lets callers propagate an earlier failure: a
// non-OK status produces an invalidated iterator without any I/O.
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
    bool is_index, bool key_includes_seq, bool index_key_is_full,
    GetContext* get_context, Status s,
    FilePrefetchBuffer* prefetch_buffer) const {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  if (!s.ok()) {
    iter->Invalidate(s);
    return iter;
  }

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  // Keep the dictionary entry alive while the block is being read/uncompressed.
  auto uncompression_dict_storage =
      GetUncompressionDict(prefetch_buffer, no_io, get_context);
  const UncompressionDict& uncompression_dict =
      uncompression_dict_storage.GetValue() == nullptr
          ? UncompressionDict::GetEmptyDict()
          : *uncompression_dict_storage.GetValue();

  CachableEntry<Block> block;
  s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block,
                    is_index, get_context);

  if (!s.ok()) {
    assert(block.IsEmpty());
    iter->Invalidate(s);
    return iter;
  }

  assert(block.GetValue() != nullptr);
  constexpr bool kTotalOrderSeek = true;
  // Block contents are pinned and it is still pinned after the iterator
  // is destroyed as long as cleanup functions are moved to another object,
  // when:
  // 1. block cache handle is set to be released in cleanup function, or
  // 2. it's pointing to immortal source. If own_bytes is true then we are
  //    not reading data from the original source, whether immortal or not.
  //    Otherwise, the block is pinned iff the source is immortal.
  const bool block_contents_pinned =
      block.IsCached() ||
      (!block.GetValue()->own_bytes() && rep_->immortal_table);
  iter = block.GetValue()->NewIterator<TBlockIter>(
      &rep_->internal_comparator, rep_->internal_comparator.user_comparator(),
      iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
      index_key_is_full, block_contents_pinned);

  if (!block.IsCached()) {
    if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
      // insert a dummy record to block cache to track the memory usage
      Cache* const block_cache = rep_->table_options.block_cache.get();
      Cache::Handle* cache_handle = nullptr;
      // There are two other types of cache keys: 1) SST cache key added in
      // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
      // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
      // from SST cache key(31 bytes), and use non-zero prefix to
      // differentiate from `write_buffer_manager`
      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
      // Prefix: use rep_->cache_key_prefix padded by 0s
      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
      assert(rep_->cache_key_prefix_size != 0);
      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                 next_cache_key_id_++);
      assert(end - cache_key <=
             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
      // Value is nullptr: this entry only charges the cache for the block's
      // memory; releasing it in the iterator's cleanup removes the charge.
      s = block_cache->Insert(unique_key, nullptr,
                              block.GetValue()->ApproximateMemoryUsage(),
                              nullptr, &cache_handle);
      if (s.ok()) {
        assert(cache_handle != nullptr);
        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
                              cache_handle);
      }
    }
  }

  // Hand the block's ownership/cache handle over to the iterator so the
  // block outlives this function.
  block.TransferTo(iter);
  return iter;
}

// Looks up the block at `handle` in the configured block caches; on a miss
// (when I/O is allowed and ro.fill_cache is set) reads it from the file and
// inserts it into the cache(s) via PutDataBlockToCache. On success with a hit
// or successful load, `block_entry` references the block; if both caches are
// absent, `block_entry` is left empty with an OK status.
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, bool is_index,
    GetContext* get_context) const {
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();

  // No point to cache compressed blocks if it never goes away
  Cache* block_cache_compressed =
      rep_->immortal_table ? nullptr
                           : rep_->table_options.block_cache_compressed.get();

  // First, try to get the block from the cache
  //
  // If either block cache is enabled, we'll try to read from it.
  Status s;
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice key /* key to the block cache */;
  Slice ckey /* key to the compressed block cache */;
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
                         rep_->compressed_cache_key_prefix_size, handle,
                         compressed_cache_key);
    }

    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
                              ro, block_entry, uncompression_dict, is_index,
                              get_context);

    // Can't find the block from the cache. If I/O is allowed, read from the
    // file.
    if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
      Statistics* statistics = rep_->ioptions.statistics;
      // Only uncompress at read time when there is no compressed cache to
      // hold the raw block.
      bool do_decompress =
          block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
      CompressionType raw_block_comp_type;
      BlockContents raw_block_contents;
      {
        StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        BlockFetcher block_fetcher(
            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
            &raw_block_contents, rep_->ioptions,
            do_decompress /* do uncompress */, rep_->blocks_maybe_compressed,
            uncompression_dict, rep_->persistent_cache_options,
            GetMemoryAllocator(rep_->table_options),
            GetMemoryAllocatorForCompressedBlock(rep_->table_options));
        s = block_fetcher.ReadBlockContents();
        raw_block_comp_type = block_fetcher.get_compression_type();
      }

      if (s.ok()) {
        SequenceNumber seq_no = rep_->get_global_seqno(is_index);
        // If filling cache is allowed and a cache is configured, try to put the
        // block to the cache.
        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
                                block_entry, &raw_block_contents,
                                raw_block_comp_type, uncompression_dict, seq_no,
                                GetMemoryAllocator(rep_->table_options),
                                is_index, get_context);
      }
    }
  }
  assert(s.ok() || block_entry->GetValue() == nullptr);
  return s;
}

// Retrieves the block at `handle`, first through the cache path
// (MaybeReadBlockAndLoadToCache) when applicable, and otherwise by reading it
// directly from the file into a block owned by `block_entry`. Index blocks
// bypass the cache path unless cache_index_and_filter_blocks is enabled.
// Returns Status::Incomplete when the block is not cached and
// ro.read_tier == kBlockCacheTier forbids the file read.
Status BlockBasedTable::RetrieveBlock(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, bool is_index,
    GetContext* get_context) const {
  assert(block_entry);
  assert(block_entry->IsEmpty());

  Status s;
  if (!is_index || rep_->table_options.cache_index_and_filter_blocks) {
    s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                     uncompression_dict, block_entry, is_index,
                                     get_context);

    if (!s.ok()) {
      return s;
    }

    if (block_entry->GetValue() != nullptr) {
      assert(s.ok());
      return s;
    }
  }

  // Cache path did not produce a block (e.g. no cache configured).
  assert(block_entry->IsEmpty());

  const bool no_io = ro.read_tier == kBlockCacheTier;
  if (no_io) {
    return Status::Incomplete("no blocking io");
  }

  std::unique_ptr<Block> block;

  {
    StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
                 READ_BLOCK_GET_MICROS);
    s = ReadBlockFromFile(
        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
        rep_->ioptions, rep_->blocks_maybe_compressed,
        rep_->blocks_maybe_compressed, uncompression_dict,
        rep_->persistent_cache_options, rep_->get_global_seqno(is_index),
        !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0,
        GetMemoryAllocator(rep_->table_options));
  }

  if (!s.ok()) {
    return s;
  }

  // The entry owns the block outright; no cache handle is involved.
  block_entry->SetOwnedValue(block.release());

  assert(s.ok());
  return s;
}

// State shared by secondary (per-partition) index iterators.
// `block_map` maps a partition's block offset to its pinned index block;
// it is stored by pointer, so it presumably must outlive this state object
// — NOTE(review): confirm lifetime against the partitioned index iterator.
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
    const BlockBasedTable* table,
    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
    bool index_key_includes_seq, bool index_key_is_full)
    : table_(table),
      block_map_(block_map),
      index_key_includes_seq_(index_key_includes_seq),
      index_key_is_full_(index_key_is_full) {}
M
Maysam Yabandeh 已提交
2176

2177
InternalIteratorBase<BlockHandle>*
2178
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
2179
    const BlockHandle& handle) {
M
Maysam Yabandeh 已提交
2180
  // Return a block iterator on the index partition
2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191
  auto rep = table_->get_rep();
  auto block = block_map_->find(handle.offset());
  // This is a possible scenario since block cache might not have had space
  // for the partition
  if (block != block_map_->end()) {
    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT);
    Cache* block_cache = rep->table_options.block_cache.get();
    assert(block_cache);
    RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ,
2192
               block_cache->GetUsage(block->second.GetCacheHandle()));
M
Maysam Yabandeh 已提交
2193
    Statistics* kNullStats = nullptr;
2194
    // We don't return pinned data from index blocks, so no need
2195
    // to set `block_contents_pinned`.
2196
    return block->second.GetValue()->NewIterator<IndexBlockIter>(
M
Maysam Yabandeh 已提交
2197
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
2198
        nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
2199 2200
  }
  // Create an empty iterator
2201
  return new IndexBlockIter();
2202 2203
}

// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
2216 2217 2218
bool BlockBasedTable::PrefixMayMatch(
    const Slice& internal_key, const ReadOptions& read_options,
    const SliceTransform* options_prefix_extractor,
2219
    const bool need_upper_bound_check) const {
2220
  if (!rep_->filter_policy) {
2221 2222 2223
    return true;
  }

2224 2225 2226 2227 2228 2229 2230 2231 2232 2233
  const SliceTransform* prefix_extractor;

  if (rep_->table_prefix_extractor == nullptr) {
    if (need_upper_bound_check) {
      return true;
    }
    prefix_extractor = options_prefix_extractor;
  } else {
    prefix_extractor = rep_->table_prefix_extractor.get();
  }
2234
  auto user_key = ExtractUserKey(internal_key);
2235
  if (!prefix_extractor->InDomain(user_key)) {
2236 2237
    return true;
  }
L
Lei Jin 已提交
2238

T
Tyler Harter 已提交
2239 2240 2241
  bool may_match = true;
  Status s;

2242
  // First, try check with full filter
2243
  auto filter_entry = GetFilter(prefix_extractor);
2244
  FilterBlockReader* filter = filter_entry.GetValue();
2245
  bool filter_checked = true;
2246 2247
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
M
Maysam Yabandeh 已提交
2248
      const Slice* const const_ikey_ptr = &internal_key;
2249 2250 2251 2252
      may_match = filter->RangeMayExist(
          read_options.iterate_upper_bound, user_key, prefix_extractor,
          rep_->internal_comparator.user_comparator(), const_ikey_ptr,
          &filter_checked, need_upper_bound_check);
2253
    } else {
2254 2255 2256 2257 2258
      // if prefix_extractor changed for block based filter, skip filter
      if (need_upper_bound_check) {
        return true;
      }
      auto prefix = prefix_extractor->Transform(user_key);
M
Maysam Yabandeh 已提交
2259 2260 2261 2262 2263 2264 2265 2266 2267
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

2268
      // Then, try find it within each block
2269 2270
      // we already know prefix_extractor and prefix_extractor_name must match
      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
2271
      std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
2272 2273
          NewIndexIterator(no_io_read_options,
                           /* need_upper_bound_check */ false));
2274 2275 2276 2277 2278 2279 2280 2281
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
2282 2283
      } else if ((rep_->table_properties &&
                          rep_->table_properties->index_key_is_user_key
M
Maysam Yabandeh 已提交
2284 2285
                      ? iiter->key()
                      : ExtractUserKey(iiter->key()))
2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block.  However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix.  Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix.  Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key.  Thus, the corresponding data block
        // is the only on could potentially contain the prefix.
2304
        BlockHandle handle = iiter->value();
2305 2306
        may_match =
            filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset());
2307
      }
2308
    }
T
Tyler Harter 已提交
2309
  }
T
Tyler Harter 已提交
2310

2311 2312 2313 2314 2315 2316
  if (filter_checked) {
    Statistics* statistics = rep_->ioptions.statistics;
    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
    if (!may_match) {
      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
    }
T
Tyler Harter 已提交
2317 2318
  }

T
Tyler Harter 已提交
2319 2320 2321
  return may_match;
}

2322 2323
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
2324
  is_out_of_bound_ = false;
2325
  if (!CheckPrefixMayMatch(target)) {
2326 2327 2328 2329
    ResetDataIter();
    return;
  }

2330
  bool need_seek_index = true;
2331
  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347
    // Reseek.
    prev_index_value_ = index_iter_->value();
    // We can avoid an index seek if:
    // 1. The new seek key is larger than the current key
    // 2. The new seek key is within the upper bound of the block
    // Since we don't necessarily know the internal key for either
    // the current key or the upper bound, we check user keys and
    // exclude the equality case. Considering internal keys can
    // improve for the boundary cases, but it would complicate the
    // code.
    if (user_comparator_.Compare(ExtractUserKey(target),
                                 block_iter_.user_key()) > 0 &&
        user_comparator_.Compare(ExtractUserKey(target),
                                 index_iter_->user_key()) < 0) {
      need_seek_index = false;
    }
2348 2349
  }

2350 2351 2352 2353 2354 2355 2356 2357
  if (need_seek_index) {
    index_iter_->Seek(target);
    if (!index_iter_->Valid()) {
      ResetDataIter();
      return;
    }
    InitDataBlock();
  }
2358

M
Maysam Yabandeh 已提交
2359
  block_iter_.Seek(target);
2360 2361

  FindKeyForward();
2362
  CheckOutOfBound();
M
Maysam Yabandeh 已提交
2363 2364 2365
  assert(
      !block_iter_.Valid() ||
      (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) ||
2366 2367
      (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target),
                                                      block_iter_.key()) <= 0));
2368 2369
}

2370 2371 2372
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
2373
  is_out_of_bound_ = false;
2374
  if (!CheckPrefixMayMatch(target)) {
2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406
    ResetDataIter();
    return;
  }

  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely to contain the position for `target`, the
  // same as Seek(), rather than than before.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4]  [6, 8] [10, 12]
  //  (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they call Seek(7).
  // The only case where the block is difference is when they seek to a position
  // in the boundary. For example, if they SeekForPrev(5), we should go to the
  // first block, rather than the second. However, we don't have the information
  // to distinguish the two unless we read the second block. In this case, we'll
  // end up with reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    index_iter_->SeekToLast();
    if (!index_iter_->Valid()) {
      ResetDataIter();
      block_iter_points_to_real_block_ = false;
      return;
    }
  }

  InitDataBlock();

M
Maysam Yabandeh 已提交
2407
  block_iter_.SeekForPrev(target);
2408 2409

  FindKeyBackward();
M
Maysam Yabandeh 已提交
2410 2411
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
2412 2413
}

2414 2415
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
2416
  is_out_of_bound_ = false;
2417 2418 2419 2420 2421 2422 2423
  SavePrevIndexValue();
  index_iter_->SeekToFirst();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
M
Maysam Yabandeh 已提交
2424
  block_iter_.SeekToFirst();
2425
  FindKeyForward();
2426
  CheckOutOfBound();
2427 2428
}

2429 2430
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
2431
  is_out_of_bound_ = false;
2432 2433 2434 2435 2436 2437 2438
  SavePrevIndexValue();
  index_iter_->SeekToLast();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
M
Maysam Yabandeh 已提交
2439
  block_iter_.SeekToLast();
2440 2441 2442
  FindKeyBackward();
}

2443 2444
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
2445
  assert(block_iter_points_to_real_block_);
M
Maysam Yabandeh 已提交
2446
  block_iter_.Next();
2447 2448 2449
  FindKeyForward();
}

2450 2451
template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
2452
    IterateResult* result) {
2453 2454 2455
  Next();
  bool is_valid = Valid();
  if (is_valid) {
2456 2457
    result->key = key();
    result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
2458 2459 2460 2461
  }
  return is_valid;
}

2462 2463
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
2464
  assert(block_iter_points_to_real_block_);
M
Maysam Yabandeh 已提交
2465
  block_iter_.Prev();
2466 2467 2468
  FindKeyBackward();
}

2469 2470 2471 2472 2473 2474 2475
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
template <class TBlockIter, typename TValue>
const size_t
    BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
        256 * 1024;

2476 2477 2478
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value();
2479
  if (!block_iter_points_to_real_block_ ||
2480
      data_block_handle.offset() != prev_index_value_.offset() ||
2481
      // if previous attempt of reading the block missed cache, try again
M
Maysam Yabandeh 已提交
2482
      block_iter_.status().IsIncomplete()) {
2483 2484 2485 2486 2487
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520
    // Prefetch additional data for range scans (iterators). Enabled only for
    // user reads.
    // Implicit auto readahead:
    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
    // Explicit user requested readahead:
    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
    if (!for_compaction_) {
      if (read_options_.readahead_size == 0) {
        // Implicit auto readahead
        num_file_reads_++;
        if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
          if (!rep->file->use_direct_io() &&
              (data_block_handle.offset() +
                   static_cast<size_t>(data_block_handle.size()) +
                   kBlockTrailerSize >
               readahead_limit_)) {
            // Buffered I/O
            // Discarding the return status of Prefetch calls intentionally, as
            // we can fallback to reading from disk if Prefetch fails.
            rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
            readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
                                                   readahead_size_);
            // Keep exponentially increasing readahead size until
            // kMaxAutoReadaheadSize.
            readahead_size_ =
                std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
          } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
            // Direct I/O
            // Let FilePrefetchBuffer take care of the readahead.
            prefetch_buffer_.reset(
                new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
                                       kMaxAutoReadaheadSize));
          }
2521
        }
2522 2523 2524 2525 2526 2527 2528
      } else if (!prefetch_buffer_) {
        // Explicit user requested readahead
        // The actual condition is:
        // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
        prefetch_buffer_.reset(new FilePrefetchBuffer(
            rep->file.get(), read_options_.readahead_size,
            read_options_.readahead_size));
2529 2530 2531
      }
    }

2532
    Status s;
2533 2534
    table_->NewDataBlockIterator<TBlockIter>(
        read_options_, data_block_handle, &block_iter_, is_index_,
2535
        key_includes_seq_, index_key_is_full_,
M
Maysam Yabandeh 已提交
2536
        /* get_context */ nullptr, s, prefetch_buffer_.get());
2537
    block_iter_points_to_real_block_ = true;
2538 2539 2540 2541 2542
    if (read_options_.iterate_upper_bound != nullptr) {
      data_block_within_upper_bound_ =
          (user_comparator_.Compare(*read_options_.iterate_upper_bound,
                                    index_iter_->user_key()) > 0);
    }
2543 2544 2545
  }
}

2546
template <class TBlockIter, typename TValue>
2547
void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
2548 2549
  // TODO the while loop inherits from two-level-iterator. We don't know
  // whether a block can be empty so it can be replaced by an "if".
2550
  do {
M
Maysam Yabandeh 已提交
2551
    if (!block_iter_.status().ok()) {
2552 2553
      return;
    }
2554
    // Whether next data block is out of upper bound, if there is one.
2555 2556 2557
    bool next_block_is_out_of_bound =
        read_options_.iterate_upper_bound != nullptr &&
        block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
2558 2559
    ResetDataIter();
    index_iter_->Next();
2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570
    if (next_block_is_out_of_bound) {
      // The next block is out of bound. No need to read it.
      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
      // We need to make sure this is not the last data block before setting
      // is_out_of_bound_, since the index key for the last data block can be
      // larger than smallest key of the next file on the same level.
      if (index_iter_->Valid()) {
        is_out_of_bound_ = true;
      }
      return;
    }
2571 2572 2573

    if (index_iter_->Valid()) {
      InitDataBlock();
M
Maysam Yabandeh 已提交
2574
      block_iter_.SeekToFirst();
2575 2576 2577
    } else {
      return;
    }
2578 2579 2580 2581 2582 2583 2584 2585 2586
  } while (!block_iter_.Valid());
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  assert(!is_out_of_bound_);

  // Roll over to the next data block only when the current one is exhausted.
  if (!block_iter_.Valid()) {
    FindBlockForward();
  }
}

2590 2591
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
M
Maysam Yabandeh 已提交
2592 2593
  while (!block_iter_.Valid()) {
    if (!block_iter_.status().ok()) {
2594 2595 2596 2597 2598 2599 2600 2601
      return;
    }

    ResetDataIter();
    index_iter_->Prev();

    if (index_iter_->Valid()) {
      InitDataBlock();
M
Maysam Yabandeh 已提交
2602
      block_iter_.SeekToLast();
2603 2604 2605 2606 2607 2608 2609 2610 2611
    } else {
      return;
    }
  }

  // We could have check lower bound here too, but we opt not to do it for
  // code simplicity.
}

2612 2613 2614 2615 2616 2617 2618 2619 2620
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
  if (read_options_.iterate_upper_bound != nullptr &&
      block_iter_points_to_real_block_ && block_iter_.Valid()) {
    is_out_of_bound_ = user_comparator_.Compare(
                           *read_options_.iterate_upper_bound, user_key()) <= 0;
  }
}

2621 2622
InternalIterator* BlockBasedTable::NewIterator(
    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
2623
    Arena* arena, bool skip_filters, bool for_compaction) {
2624
  bool need_upper_bound_check =
2625
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
M
Maysam Yabandeh 已提交
2626
  const bool kIsNotIndex = false;
2627
  if (arena == nullptr) {
M
Maysam Yabandeh 已提交
2628
    return new BlockBasedTableIterator<DataBlockIter>(
2629
        this, read_options, rep_->internal_comparator,
2630 2631
        NewIndexIterator(
            read_options,
2632
            need_upper_bound_check &&
M
Maysam Yabandeh 已提交
2633
                rep_->index_type == BlockBasedTableOptions::kHashSearch),
2634
        !skip_filters && !read_options.total_order_seek &&
2635 2636
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, kIsNotIndex,
2637
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
2638
  } else {
M
Maysam Yabandeh 已提交
2639 2640 2641
    auto* mem =
        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
    return new (mem) BlockBasedTableIterator<DataBlockIter>(
2642
        this, read_options, rep_->internal_comparator,
2643
        NewIndexIterator(read_options, need_upper_bound_check),
2644
        !skip_filters && !read_options.total_order_seek &&
2645 2646
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, kIsNotIndex,
2647
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
2648
  }
J
jorlow@chromium.org 已提交
2649 2650
}

2651
FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
2652
    const ReadOptions& read_options) {
2653 2654 2655
  if (rep_->fragmented_range_dels == nullptr) {
    return nullptr;
  }
2656 2657 2658 2659 2660
  SequenceNumber snapshot = kMaxSequenceNumber;
  if (read_options.snapshot != nullptr) {
    snapshot = read_options.snapshot->GetSequenceNumber();
  }
  return new FragmentedRangeTombstoneIterator(
2661
      rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
2662 2663
}

2664 2665 2666 2667
bool BlockBasedTable::FullFilterKeyMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    const Slice& internal_key, const bool no_io,
    const SliceTransform* prefix_extractor) const {
2668 2669 2670 2671
  if (filter == nullptr || filter->IsBlockBased()) {
    return true;
  }
  Slice user_key = ExtractUserKey(internal_key);
M
Maysam Yabandeh 已提交
2672
  const Slice* const const_ikey_ptr = &internal_key;
2673
  bool may_match = true;
2674
  if (filter->whole_key_filtering()) {
2675 2676 2677
    may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid,
                                    no_io, const_ikey_ptr);
  } else if (!read_options.total_order_seek && prefix_extractor &&
2678
             rep_->table_properties->prefix_extractor_name.compare(
2679 2680 2681 2682 2683
                 prefix_extractor->Name()) == 0 &&
             prefix_extractor->InDomain(user_key) &&
             !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
                                     prefix_extractor, kNotValid, false,
                                     const_ikey_ptr)) {
2684 2685 2686 2687
    may_match = false;
  }
  if (may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
2688
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
2689
  }
2690
  return may_match;
2691 2692
}

2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715
void BlockBasedTable::FullFilterKeysMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    MultiGetRange* range, const bool no_io,
    const SliceTransform* prefix_extractor) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return;
  }
  if (filter->whole_key_filtering()) {
    filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0) {
    for (auto iter = range->begin(); iter != range->end(); ++iter) {
      Slice user_key = iter->lkey->user_key();

      if (!prefix_extractor->InDomain(user_key)) {
        range->SkipKey(iter);
      }
    }
    filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false);
  }
}

2716
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2717 2718 2719
                            GetContext* get_context,
                            const SliceTransform* prefix_extractor,
                            bool skip_filters) {
M
Maysam Yabandeh 已提交
2720
  assert(key.size() >= 8);  // key must be internal key
S
Sanjay Ghemawat 已提交
2721
  Status s;
M
Maysam Yabandeh 已提交
2722
  const bool no_io = read_options.read_tier == kBlockCacheTier;
2723
  CachableEntry<FilterBlockReader> filter_entry;
2724 2725 2726 2727 2728 2729 2730 2731
  bool may_match;
  FilterBlockReader* filter = nullptr;
  {
    if (!skip_filters) {
      filter_entry =
          GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr,
                    read_options.read_tier == kBlockCacheTier, get_context);
    }
2732
    filter = filter_entry.GetValue();
2733

2734 2735 2736 2737 2738 2739
    // First check the full filter
    // If full filter not useful, Then go into each block
    may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io,
                                      prefix_extractor);
  }
  if (!may_match) {
2740
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2741
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2742
  } else {
M
Maysam Yabandeh 已提交
2743
    IndexBlockIter iiter_on_stack;
2744 2745
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2746
    bool need_upper_bound_check = false;
2747
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2748
      need_upper_bound_check = PrefixExtractorChanged(
2749
          rep_->table_properties.get(), prefix_extractor);
2750
    }
2751 2752
    auto iiter = NewIndexIterator(read_options, need_upper_bound_check,
                                  &iiter_on_stack, get_context);
2753
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
M
Maysam Yabandeh 已提交
2754
    if (iiter != &iiter_on_stack) {
M
Maysam Yabandeh 已提交
2755
      iiter_unique_ptr.reset(iiter);
M
Maysam Yabandeh 已提交
2756
    }
2757

2758
    bool matched = false;  // if such user key mathced a key in SST
2759
    bool done = false;
M
Maysam Yabandeh 已提交
2760
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
2761
      BlockHandle handle = iiter->value();
2762

2763 2764
      bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() == true &&
2765 2766
          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
                               handle.offset(), no_io);
2767 2768 2769 2770 2771 2772

      if (not_exist_in_filter) {
        // Not found
        // TODO: think about interaction with Merge. If a user key cannot
        // cross one data block, we should be fine.
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2773
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2774 2775
        break;
      } else {
M
Maysam Yabandeh 已提交
2776 2777
        DataBlockIter biter;
        NewDataBlockIterator<DataBlockIter>(
2778
            read_options, iiter->value(), &biter, false,
2779 2780
            true /* key_includes_seq */, true /* index_key_is_full */,
            get_context);
2781

2782
        if (read_options.read_tier == kBlockCacheTier &&
M
Maysam Yabandeh 已提交
2783
            biter.status().IsIncomplete()) {
2784
          // couldn't get block from block_cache
2785 2786
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
2787
          get_context->MarkKeyMayExist();
2788 2789
          break;
        }
M
Maysam Yabandeh 已提交
2790 2791
        if (!biter.status().ok()) {
          s = biter.status();
2792 2793 2794
          break;
        }

2795 2796 2797 2798 2799 2800 2801 2802 2803
        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key this block and the the iter is not
          // the end of the block, i.e. cannot be in the following blocks
          // either. In this case, the seek_key cannot be found, so we break
          // from the top level for-loop.
          break;
        }

2804
        // Call the *saver function on each entry/block until it returns false
2805
        for (; biter.Valid(); biter.Next()) {
2806
          ParsedInternalKey parsed_key;
M
Maysam Yabandeh 已提交
2807
          if (!ParseInternalKey(biter.key(), &parsed_key)) {
2808 2809 2810
            s = Status::Corruption(Slice());
          }

2811 2812 2813
          if (!get_context->SaveValue(
                  parsed_key, biter.value(), &matched,
                  biter.IsValuePinned() ? &biter : nullptr)) {
2814 2815 2816 2817
            done = true;
            break;
          }
        }
M
Maysam Yabandeh 已提交
2818
        s = biter.status();
S
Sanjay Ghemawat 已提交
2819
      }
M
Maysam Yabandeh 已提交
2820 2821 2822 2823
      if (done) {
        // Avoid the extra Next which is expensive in two-level indexes
        break;
      }
2824
    }
2825 2826
    if (matched && filter != nullptr && !filter->IsBlockBased()) {
      RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2827 2828
      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                rep_->level);
2829
    }
2830
    if (s.ok()) {
M
Maysam Yabandeh 已提交
2831
      s = iiter->status();
S
Sanjay Ghemawat 已提交
2832 2833
    }
  }
K
Kai Liu 已提交
2834

S
Sanjay Ghemawat 已提交
2835 2836 2837
  return s;
}

2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854
using MultiGetRange = MultiGetContext::Range;
void BlockBasedTable::MultiGet(const ReadOptions& read_options,
                               const MultiGetRange* mget_range,
                               const SliceTransform* prefix_extractor,
                               bool skip_filters) {
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  FilterBlockReader* filter = nullptr;
  MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
                               mget_range->end());
  {
    if (!skip_filters) {
      // TODO: Figure out where the stats should go
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               nullptr /*get_context*/);
    }
2855
    filter = filter_entry.GetValue();
2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870

    // First check the full filter
    // If full filter not useful, Then go into each block
    FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
                           prefix_extractor);
  }
  if (skip_filters || !sst_file_range.empty()) {
    IndexBlockIter iiter_on_stack;
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
2871 2872 2873
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         sst_file_range.begin()->get_context);
2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
         ++miter) {
      Status s;
      GetContext* get_context = miter->get_context;
      const Slice& key = miter->ikey;
      bool matched = false;  // if such user key matched a key in SST
      bool done = false;
      for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
        DataBlockIter biter;
        NewDataBlockIterator<DataBlockIter>(
2889
            read_options, iiter->value(), &biter, false,
2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946
            true /* key_includes_seq */, get_context);

        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key this block and the the iter is not
          // the end of the block, i.e. cannot be in the following blocks
          // either. In this case, the seek_key cannot be found, so we break
          // from the top level for-loop.
          break;
        }

        // Call the *saver function on each entry/block until it returns false
        for (; biter.Valid(); biter.Next()) {
          ParsedInternalKey parsed_key;
          if (!ParseInternalKey(biter.key(), &parsed_key)) {
            s = Status::Corruption(Slice());
          }

          if (!get_context->SaveValue(
                  parsed_key, biter.value(), &matched,
                  biter.IsValuePinned() ? &biter : nullptr)) {
            done = true;
            break;
          }
        }
        s = biter.status();
        if (done) {
          // Avoid the extra Next which is expensive in two-level indexes
          break;
        }
      }
      if (matched && filter != nullptr && !filter->IsBlockBased()) {
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                  rep_->level);
      }
      if (s.ok()) {
        s = iiter->status();
      }
      *(miter->s) = s;
    }
  }
}

2947 2948 2949
Status BlockBasedTable::Prefetch(const Slice* const begin,
                                 const Slice* const end) {
  auto& comparator = rep_->internal_comparator;
M
Maysam Yabandeh 已提交
2950
  auto user_comparator = comparator.user_comparator();
2951 2952 2953 2954 2955
  // pre-condition
  if (begin && end && comparator.Compare(*begin, *end) > 0) {
    return Status::InvalidArgument(*begin, *end);
  }

M
Maysam Yabandeh 已提交
2956
  IndexBlockIter iiter_on_stack;
2957
  auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
2958
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
M
Maysam Yabandeh 已提交
2959
  if (iiter != &iiter_on_stack) {
2960 2961
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
M
Maysam Yabandeh 已提交
2962
  }
2963

M
Maysam Yabandeh 已提交
2964
  if (!iiter->status().ok()) {
2965
    // error opening index iterator
M
Maysam Yabandeh 已提交
2966
    return iiter->status();
2967 2968 2969 2970 2971
  }

  // indicates if we are on the last page that need to be pre-fetched
  bool prefetching_boundary_page = false;

M
Maysam Yabandeh 已提交
2972 2973
  for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
       iiter->Next()) {
2974
    BlockHandle block_handle = iiter->value();
2975 2976
    const bool is_user_key = rep_->table_properties &&
                             rep_->table_properties->index_key_is_user_key > 0;
M
Maysam Yabandeh 已提交
2977 2978 2979 2980
    if (end &&
        ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
         (is_user_key &&
          user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
2981 2982 2983 2984 2985 2986 2987 2988 2989 2990
      if (prefetching_boundary_page) {
        break;
      }

      // The index entry represents the last key in the data block.
      // We should load this page into memory as well, but no more
      prefetching_boundary_page = true;
    }

    // Load the block specified by the block_handle into the block cache
M
Maysam Yabandeh 已提交
2991
    DataBlockIter biter;
2992
    NewDataBlockIterator<DataBlockIter>(ReadOptions(), block_handle, &biter);
2993 2994 2995 2996 2997 2998 2999 3000 3001 3002

    if (!biter.status().ok()) {
      // there was an unexpected error while pre-fetching
      return biter.status();
    }
  }

  return Status::OK();
}

A
Aaron G 已提交
3003 3004 3005 3006 3007
Status BlockBasedTable::VerifyChecksum() {
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
3008
  s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter);
A
Aaron G 已提交
3009
  if (s.ok()) {
3010
    s = VerifyChecksumInMetaBlocks(meta_iter.get());
A
Aaron G 已提交
3011 3012 3013 3014 3015 3016 3017
    if (!s.ok()) {
      return s;
    }
  } else {
    return s;
  }
  // Check Data blocks
M
Maysam Yabandeh 已提交
3018
  IndexBlockIter iiter_on_stack;
3019
  InternalIteratorBase<BlockHandle>* iiter =
3020
      NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
3021
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
A
Aaron G 已提交
3022
  if (iiter != &iiter_on_stack) {
3023 3024
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
A
Aaron G 已提交
3025 3026 3027 3028 3029 3030 3031 3032 3033
  }
  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }
  s = VerifyChecksumInBlocks(iiter);
  return s;
}

3034 3035
Status BlockBasedTable::VerifyChecksumInBlocks(
    InternalIteratorBase<BlockHandle>* index_iter) {
A
Aaron G 已提交
3036 3037 3038 3039 3040 3041
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
3042 3043
    BlockHandle handle = index_iter->value();
    BlockContents contents;
3044 3045 3046 3047
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
3048
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3049 3050 3051 3052 3053 3054 3055 3056
    s = block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

3057
Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3058 3059 3060 3061
    InternalIteratorBase<Slice>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
A
Aaron G 已提交
3062 3063 3064
    if (!s.ok()) {
      break;
    }
3065 3066 3067
    BlockHandle handle;
    Slice input = index_iter->value();
    s = handle.DecodeFrom(&input);
A
Aaron G 已提交
3068
    BlockContents contents;
3069 3070 3071 3072
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
3073
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
S
Siying Dong 已提交
3074
    s = block_fetcher.ReadBlockContents();
3075 3076
    if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) {
      TableProperties* table_properties;
3077
      s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3078 3079 3080 3081
                                           index_iter->value(),
                                           &table_properties);
      delete table_properties;
    }
A
Aaron G 已提交
3082 3083 3084 3085 3086 3087 3088
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

3089 3090 3091 3092 3093 3094 3095 3096 3097
bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
  assert(rep_ != nullptr);

  Cache* const cache = rep_->table_options.block_cache.get();
  if (cache == nullptr) {
    return false;
  }

  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3098 3099 3100
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);
3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111

  Cache::Handle* const cache_handle = cache->Lookup(cache_key);
  if (cache_handle == nullptr) {
    return false;
  }

  cache->Release(cache_handle);

  return true;
}

S
Siying Dong 已提交
3112 3113
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
3114 3115
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
      NewIndexIterator(options));
I
Igor Canadi 已提交
3116 3117 3118
  iiter->Seek(key);
  assert(iiter->Valid());

3119
  return TEST_BlockInCache(iiter->value());
3120
}
S
Sanjay Ghemawat 已提交
3121

3122
BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
3123 3124
  // Some old version of block-based tables don't have index type present in
  // table properties. If that's the case we can safely use the kBinarySearch.
3125 3126
  BlockBasedTableOptions::IndexType index_type_on_file =
      BlockBasedTableOptions::kBinarySearch;
3127 3128 3129 3130 3131 3132
  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
3133 3134
      // update index_type with the true type
      rep_->index_type = index_type_on_file;
3135 3136
    }
  }
3137 3138 3139 3140 3141 3142 3143 3144 3145 3146
  return index_type_on_file;
}

// REQUIRES: The following fields of rep_ should have already been populated:
//  1. file
//  2. index_handle,
//  3. options
//  4. internal_comparator
//  5. index_type
//
// Instantiates the IndexReader matching rep_->index_type. For kHashSearch the
// metaindex block is needed; if `preloaded_meta_index_iter` is null it is read
// here, and on failure we degrade gracefully to a binary-search index.
Status BlockBasedTable::CreateIndexReader(
    FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
    bool pin, IndexReader** index_reader) {
  auto index_type_on_file = rep_->index_type;

  // kHashSearch requires non-empty prefix_extractor but bypass checking
  // prefix_extractor here since we have no access to MutableCFOptions.
  // Add need_upper_bound_check flag in  BlockBasedTable::NewIndexIterator.
  // If prefix_extractor does not match prefix_extractor_name from table
  // properties, turn off Hash Index by setting total_order_seek to true

  switch (index_type_on_file) {
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
                                          prefetch, pin, index_reader);
    }
    case BlockBasedTableOptions::kBinarySearch: {
      return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                             prefetch, pin, index_reader);
    }
    case BlockBasedTableOptions::kHashSearch: {
      std::unique_ptr<Block> meta_guard;
      std::unique_ptr<InternalIterator> meta_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      if (meta_index_iter == nullptr) {
        auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard);
        if (!s.ok()) {
          // we simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          ROCKS_LOG_WARN(rep_->ioptions.info_log,
                         "Unable to read the metaindex block."
                         " Fall back to binary search index.");
          return BinarySearchIndexReader::Create(
              this, prefetch_buffer, use_cache, prefetch, pin, index_reader);
        }
        meta_index_iter = meta_iter_guard.get();
      }

      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
                                     use_cache, prefetch, pin, index_reader);
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(index_type_on_file);
      return Status::InvalidArgument(error_message.c_str());
    }
  }
}

S
Siying Dong 已提交
3196
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
3197
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
3198
      NewIndexIterator(ReadOptions()));
K
Kai Liu 已提交
3199

J
jorlow@chromium.org 已提交
3200 3201 3202
  index_iter->Seek(key);
  uint64_t result;
  if (index_iter->Valid()) {
3203 3204
    BlockHandle handle = index_iter->value();
    result = handle.offset();
J
jorlow@chromium.org 已提交
3205
  } else {
K
Kai Liu 已提交
3206 3207 3208
    // key is past the last key in the file. If table_properties is not
    // available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
3209 3210 3211 3212
    result = 0;
    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
    }
K
Kai Liu 已提交
3213 3214
    // table_properties is not present in the table.
    if (result == 0) {
I
xxHash  
Igor Canadi 已提交
3215
      result = rep_->footer.metaindex_handle().offset();
K
Kai Liu 已提交
3216
    }
J
jorlow@chromium.org 已提交
3217 3218 3219 3220
  }
  return result;
}

3221 3222 3223 3224
bool BlockBasedTable::TEST_filter_block_preloaded() const {
  return rep_->filter != nullptr;
}

3225 3226 3227 3228
bool BlockBasedTable::TEST_IndexBlockInCache() const {
  assert(rep_ != nullptr);

  return TEST_BlockInCache(rep_->footer.index_handle());
3229 3230
}

O
omegaga 已提交
3231 3232
Status BlockBasedTable::GetKVPairsFromDataBlocks(
    std::vector<KVPairBlock>* kv_pair_blocks) {
3233
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
O
omegaga 已提交
3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250
      NewIndexIterator(ReadOptions()));

  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    // Cannot read Index Block
    return s;
  }

  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();

    if (!s.ok()) {
      break;
    }

    std::unique_ptr<InternalIterator> datablock_iter;
M
Maysam Yabandeh 已提交
3251
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3252
        ReadOptions(), blockhandles_iter->value()));
O
omegaga 已提交
3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280
    s = datablock_iter->status();

    if (!s.ok()) {
      // Error reading the block - Skipped
      continue;
    }

    KVPairBlock kv_pair_block;
    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        // Error reading the block - Skipped
        break;
      }
      const Slice& key = datablock_iter->key();
      const Slice& value = datablock_iter->value();
      std::string key_copy = std::string(key.data(), key.size());
      std::string value_copy = std::string(value.data(), value.size());

      kv_pair_block.push_back(
          std::make_pair(std::move(key_copy), std::move(value_copy)));
    }
    kv_pair_blocks->push_back(std::move(kv_pair_block));
  }
  return Status::OK();
}

3281 3282
Status BlockBasedTable::DumpTable(WritableFile* out_file,
                                  const SliceTransform* prefix_extractor) {
3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295
  // Output Footer
  out_file->Append(
      "Footer Details:\n"
      "--------------------------------------\n"
      "  ");
  out_file->Append(rep_->footer.ToString().c_str());
  out_file->Append("\n");

  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<Block> meta;
S
sdong 已提交
3296
  std::unique_ptr<InternalIterator> meta_iter;
3297
  Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter);
3298 3299 3300 3301 3302 3303 3304 3305 3306 3307
  if (s.ok()) {
    for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
      s = meta_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (meta_iter->key() == rocksdb::kPropertiesBlock) {
        out_file->Append("  Properties block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
3308 3309 3310 3311
      } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
        out_file->Append("  Compression dictionary block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
3312 3313 3314 3315 3316
      } else if (strstr(meta_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append("  Filter block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
3317 3318 3319 3320
      } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
        out_file->Append("  Range deletion block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const rocksdb::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
    out_file->Append("\n");

3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354
    // Output Filter blocks
    if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
      // Support only BloomFilter as off now
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
      if (table_properties->filter_policy_name.compare(
              table_options.filter_policy->Name()) == 0) {
        std::string filter_block_key = kFilterBlockPrefix;
        filter_block_key.append(table_properties->filter_policy_name);
        BlockHandle handle;
        if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
          BlockContents block;
          BlockFetcher block_fetcher(
              rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer,
              ReadOptions(), handle, &block, rep_->ioptions,
3355
              false /*decompress*/, false /*maybe_compressed*/,
3356
              UncompressionDict::GetEmptyDict(),
3357
              rep_->persistent_cache_options);
3358 3359 3360 3361 3362 3363 3364
          s = block_fetcher.ReadBlockContents();
          if (!s.ok()) {
            rep_->filter.reset(new BlockBasedFilterBlockReader(
                prefix_extractor, table_options,
                table_options.whole_key_filtering, std::move(block),
                rep_->ioptions.statistics));
          }
3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382
        }
      }
    }
  }
  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }
3383 3384

  // Output compression dictionary
3385 3386
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<const BlockContents> compression_dict_block;
3387
    s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */,
3388 3389 3390 3391 3392 3393
                                 &compression_dict_block);
    if (!s.ok()) {
      return s;
    }
    assert(compression_dict_block != nullptr);
    auto compression_dict = compression_dict_block->data;
3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append("  size (bytes): ");
    out_file->Append(rocksdb::ToString(compression_dict.size()));
    out_file->Append("\n\n");
    out_file->Append("  HEX    ");
    out_file->Append(compression_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

3405
  // Output range deletions block
A
Andrew Kryczka 已提交
3406
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
A
Andrew Kryczka 已提交
3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          "  ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
3418
    }
A
Andrew Kryczka 已提交
3419
    delete range_del_iter;
3420
  }
3421 3422 3423 3424 3425 3426
  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

3427
void BlockBasedTable::Close() {
3428 3429 3430
  if (rep_->closed) {
    return;
  }
3431 3432 3433 3434 3435

  Cache* const cache = rep_->table_options.block_cache.get();

  // cleanup index, filter, and compression dictionary blocks
  // to avoid accessing dangling pointers
3436 3437
  if (!rep_->table_options.no_block_cache) {
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3438

3439 3440
    // Get the filter block key
    auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
M
Maysam Yabandeh 已提交
3441
                           rep_->filter_handle, cache_key);
3442 3443 3444 3445 3446 3447 3448 3449
    cache->Erase(key);

    if (!rep_->compression_dict_handle.IsNull()) {
      // Get the compression dictionary block key
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        rep_->compression_dict_handle, cache_key);
      cache->Erase(key);
    }
3450
  }
3451

3452
  rep_->closed = true;
3453 3454
}

3455 3456 3457 3458
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
3459
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
S
sdong 已提交
3460
      NewIndexIterator(ReadOptions()));
3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
M
Maysam Yabandeh 已提交
3476
    Slice user_key;
3477
    InternalKey ikey;
3478 3479 3480 3481
    if (rep_->table_properties &&
        rep_->table_properties->index_key_is_user_key != 0) {
      user_key = key;
    } else {
M
Maysam Yabandeh 已提交
3482 3483 3484
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }
3485 3486

    out_file->Append("  HEX    ");
M
Maysam Yabandeh 已提交
3487
    out_file->Append(user_key.ToString(true).c_str());
3488 3489 3490 3491
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");

M
Maysam Yabandeh 已提交
3492
    std::string str_key = user_key.ToString();
3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append("  ASCII  ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

// Writes a human-readable dump of every data block to `out_file`, followed
// by a summary of block count and min/max/avg block sizes.
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions()));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value();
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(rocksdb::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value()));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped \n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped \n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(rocksdb::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
3599 3600 3601 3602 3603
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
3604 3605 3606
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
3607 3608 3609 3610 3611
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
3612 3613 3614 3615 3616 3617 3618 3619 3620 3621
    res_value.append(1, cspace);
  }

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

3622 3623
namespace {

A
Andrew Kryczka 已提交
3624
void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
3625 3626 3627
  FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
  if (filter->statistics() != nullptr) {
    RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
3628
               filter->ApproximateMemoryUsage());
3629 3630 3631 3632
  }
  delete filter;
}

3633 3634 3635 3636 3637 3638 3639
void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) {
  UncompressionDict* dict = reinterpret_cast<UncompressionDict*>(value);
  RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
             dict->ApproximateMemoryUsage());
  delete dict;
}

3640 3641
}  // anonymous namespace

}  // namespace rocksdb