block_based_table_reader.cc 148.9 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5
//
J
jorlow@chromium.org 已提交
6 7 8
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#include "table/block_based/block_based_table_reader.h"
10

11
#include <algorithm>
12
#include <array>
13
#include <limits>
14 15
#include <string>
#include <utility>
O
omegaga 已提交
16
#include <vector>
17

T
Tyler Harter 已提交
18
#include "db/dbformat.h"
19
#include "db/pinned_iterators_manager.h"
T
Tyler Harter 已提交
20

21
#include "rocksdb/cache.h"
22 23 24
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
25
#include "rocksdb/iterator.h"
26 27
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
S
Siying Dong 已提交
28
#include "rocksdb/table.h"
29
#include "rocksdb/table_properties.h"
30

31 32 33 34 35 36 37
#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
38
#include "table/block_fetcher.h"
J
jorlow@chromium.org 已提交
39
#include "table/format.h"
K
krad 已提交
40
#include "table/get_context.h"
S
sdong 已提交
41
#include "table/internal_iterator.h"
42
#include "table/meta_blocks.h"
43
#include "table/multiget_context.h"
K
krad 已提交
44
#include "table/persistent_cache_helper.h"
45
#include "table/sst_file_writer_collectors.h"
J
jorlow@chromium.org 已提交
46
#include "table/two_level_iterator.h"
47

48
#include "monitoring/perf_context_imp.h"
49
#include "test_util/sync_point.h"
J
jorlow@chromium.org 已提交
50
#include "util/coding.h"
51
#include "util/crc32c.h"
52
#include "util/file_reader_writer.h"
53
#include "util/stop_watch.h"
54
#include "util/string_util.h"
55
#include "util/xxhash.h"
J
jorlow@chromium.org 已提交
56

57
namespace rocksdb {
J
jorlow@chromium.org 已提交
58

I
xxHash  
Igor Canadi 已提交
59
// Magic number identifying block-based table files; the constant itself is
// defined in another translation unit.
extern const uint64_t kBlockBasedTableMagicNumber;
// Names of the meta-blocks that store the hash-index prefixes and their
// metadata (declared extern; defined elsewhere).
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

// Shorthand used throughout this file.
typedef BlockBasedTable::IndexReader IndexReader;

M
Maysam Yabandeh 已提交
65 66 67 68 69
BlockBasedTable::~BlockBasedTable() {
  // Close() is expected to release cached resources before the rep is
  // destroyed; its definition is not visible in this chunk.
  Close();
  delete rep_;
}

// Process-wide monotonically increasing counter; presumably used to build
// unique block-cache key prefixes — confirm at the call sites.
std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

72 73 74 75 76
namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
    bool for_compaction = false) {
  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    // Only materialize a Block once the raw contents were read (and possibly
    // decompressed) successfully; otherwise *result stays untouched.
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}

Y
Yi Wu 已提交
102
// Returns the memory allocator attached to the (uncompressed) block cache,
// or nullptr when no block cache is configured.
inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  const auto& cache = table_options.block_cache;
  if (!cache) {
    return nullptr;
  }
  return cache->memory_allocator();
}

// Same as GetMemoryAllocator, but for the compressed block cache.
inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  const auto& compressed_cache = table_options.block_cache_compressed;
  if (!compressed_cache) {
    return nullptr;
  }
  return compressed_cache->memory_allocator();
}

116 117
// Delete the entry resided in the cache.
// Generic cache-deleter callback: `value` is assumed to point at an `Entry`
// allocated with `new`.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

// Deleters for cache entries holding filter and uncompression-dictionary
// objects; presumably defined later in this file.
void DeleteCachedFilterEntry(const Slice& key, void* value);
void DeleteCachedUncompressionDictEntry(const Slice& key, void* value);
125

126 127 128 129
// Release the cached entry and decrement its ref count.
// Signature matches a generic cleanup callback: `arg` is the Cache, `h` the
// Cache::Handle — presumably registered via a Cleanable; confirm at call
// sites. Uses static_cast (not reinterpret_cast): converting from void* back
// to the original pointer type is exactly what static_cast is for.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = static_cast<Cache*>(arg);
  Cache::Handle* handle = static_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = static_cast<Cache*>(arg);
  Cache::Handle* handle = static_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

141 142 143
// For hash based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
144 145
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
146 147 148
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index in prefix_extractor is not set; if  prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index
149 150
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
151 152
    return true;
  }
153

154
  // prefix_extractor and prefix_extractor_block are both non-empty
155 156
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
157 158 159 160 161 162
    return true;
  } else {
    return false;
  }
}

163 164
}  // namespace

165 166 167 168 169
// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  // `index_block` may be empty; in that case the block is (re)fetched on
  // demand via GetOrReadIndexBlock().
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  // Reads the table's top-level index block into *index_block.
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  // True unless table properties record index_key_is_user_key; conservatively
  // true when properties are unavailable.
  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_key_is_user_key;
  }

  // True unless table properties record index_value_is_delta_encoded;
  // conservatively true when properties are unavailable.
  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_value_is_delta_encoded;
  }

  // Hands out index_block_ as an unowned reference when this reader holds a
  // copy; otherwise reads the block (restricted to the cache tier if no_io).
  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  // Counts the index block only when this reader owns it (a cache-resident
  // block is charged to the cache instead).
  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

// Fetches the table's top-level index block (handle taken from the footer)
// into *index_block, timing the work under read_index_block_nanos.
Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const table_rep = table->get_rep();
  assert(table_rep != nullptr);

  return table->RetrieveBlock(prefetch_buffer, read_options,
                              table_rep->footer.index_handle(),
                              UncompressionDict::GetEmptyDict(), index_block,
                              BlockType::kIndex, get_context, lookup_context);
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  // Fast path: this reader already holds a copy of the index block; hand it
  // out as an unowned reference.
  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  // Slow path: fetch the block, restricting ourselves to the block cache
  // tier when no I/O is allowed.
  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        get_context, lookup_context, index_block);
}

M
Maysam Yabandeh 已提交
272
// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      // When the block is cache-managed and not pinned, drop our reference;
      // NewIterator will look it up in the cache again.
      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new PartitionIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  // return a two-level iterator: first level is on the partition index
  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    InternalIteratorBase<BlockHandle>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // Partitions were pinned by CacheDependencies(); serve the second
      // level from partition_map_.
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(
              table(), &partition_map_, index_key_includes_seq(),
              index_value_is_full()),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          index_key_includes_seq(), index_value_is_full(),
          lookup_context ? lookup_context->caller
                         : TableReaderCaller::kUncategorized);
    }

    assert(it != nullptr);
    // Tie the top-level block's lifetime to the returned iterator.
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currentlly it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  // Warms the cache with all index partitions, optionally pinning them in
  // partition_map_. Failures are tolerated (best-effort warm-up).
  void CacheDependencies(bool pin) override {
    // Before read partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
    auto rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());
    // Index partitions are assumed to be consecuitive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    auto& file = rep->file;
    prefetch_buffer.reset(new FilePrefetchBuffer());
    // NOTE(review): the Prefetch status is ignored here; presumably the
    // per-partition reads below simply fall back to regular I/O if the
    // prefetch failed — confirm this is intentional.
    s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value();
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context);

      assert(s.ok() || block.GetValue() == nullptr);
      // Failures for individual partitions are tolerated; only successfully
      // cached blocks are (optionally) pinned.
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  // Pinned index partitions, keyed by partition block offset.
  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

459 460 461
// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      // When the block is cache-managed and not pinned, drop our reference;
      // NewIterator will look it up in the cache again.
      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new BinarySearchIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      // Report the failure through the caller-supplied iterator when there
      // is one; otherwise allocate an error iterator.
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());

    assert(it != nullptr);
    // Tie the index block's lifetime to the returned iterator.
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Reads the index block plus the hash-index prefix blocks from the
  // metaindex. NOTE(review): unlike the sibling Create() functions,
  // *index_reader is populated before the prefix-block reads, so an early
  // error return below can leave it set — confirm callers handle this.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    auto rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader = new HashIndexReader(table, std::move(index_block));
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_key_includes_seq(),
        index_value_is_full(), false /* block_contents_pinned */,
        prefix_index_.get());

    assert(it != nullptr);
    // Tie the index block's lifetime to the returned iterator.
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    // NOTE(review): prefix_index_ is only counted in this branch; in the
    // malloc_usable_size branch its separate allocation appears uncounted —
    // confirm whether that is intentional.
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  // Optional prefix hash index; null when the meta blocks were absent or
  // unreadable (binary search is used instead).
  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
// Records a block-cache hit: bumps the aggregate and per-level perf
// counters, then the per-block-type statistics. When a GetContext is
// supplied the counts are accumulated there instead of in the global
// Statistics object.
void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const stats = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context != nullptr) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(stats, BLOCK_CACHE_HIT);
    RecordTick(stats, BLOCK_CACHE_BYTES_READ, usage);
  }

  if (block_type == BlockType::kFilter) {
    PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

    if (get_context != nullptr) {
      ++get_context->get_context_stats_.num_cache_filter_hit;
    } else {
      RecordTick(stats, BLOCK_CACHE_FILTER_HIT);
    }
  } else if (block_type == BlockType::kCompressionDictionary) {
    // TODO: introduce perf counter for compression dictionary hit count
    if (get_context != nullptr) {
      ++get_context->get_context_stats_.num_cache_compression_dict_hit;
    } else {
      RecordTick(stats, BLOCK_CACHE_COMPRESSION_DICT_HIT);
    }
  } else if (block_type == BlockType::kIndex) {
    PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

    if (get_context != nullptr) {
      ++get_context->get_context_stats_.num_cache_index_hit;
    } else {
      RecordTick(stats, BLOCK_CACHE_INDEX_HIT);
    }
  } else {
    // TODO: introduce dedicated tickers/statistics/counters
    // for range tombstones
    if (get_context != nullptr) {
      ++get_context->get_context_stats_.num_cache_data_hit;
    } else {
      RecordTick(stats, BLOCK_CACHE_DATA_HIT);
    }
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  // Record a block-cache miss for a block of `block_type`, either into the
  // GetContext stats (if present) or directly into Statistics.
  Statistics* const stats = rep_->ioptions.statistics;
  const bool use_get_context = get_context != nullptr;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (use_get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(stats, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  if (block_type == BlockType::kFilter) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_filter_miss;
    } else {
      RecordTick(stats, BLOCK_CACHE_FILTER_MISS);
    }
  } else if (block_type == BlockType::kCompressionDictionary) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_compression_dict_miss;
    } else {
      RecordTick(stats, BLOCK_CACHE_COMPRESSION_DICT_MISS);
    }
  } else if (block_type == BlockType::kIndex) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_index_miss;
    } else {
      RecordTick(stats, BLOCK_CACHE_INDEX_MISS);
    }
  } else {
    // All remaining block types share the "data" tickers.
    // TODO: introduce dedicated tickers/statistics/counters
    // for range tombstones
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_data_miss;
    } else {
      RecordTick(stats, BLOCK_CACHE_DATA_MISS);
    }
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  // Record the insertion of a `usage`-byte block of `block_type` into the
  // block cache, via GetContext stats when available, else Statistics.
  Statistics* const stats = rep_->ioptions.statistics;
  const bool use_get_context = get_context != nullptr;

  // TODO: introduce perf counters for block cache insertions
  if (use_get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(stats, BLOCK_CACHE_ADD);
    RecordTick(stats, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  // Per-block-type insertion accounting (count plus bytes inserted).
  if (block_type == BlockType::kFilter) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_filter_add;
      get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
    } else {
      RecordTick(stats, BLOCK_CACHE_FILTER_ADD);
      RecordTick(stats, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
    }
  } else if (block_type == BlockType::kCompressionDictionary) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_compression_dict_add;
      get_context->get_context_stats_
          .num_cache_compression_dict_bytes_insert += usage;
    } else {
      RecordTick(stats, BLOCK_CACHE_COMPRESSION_DICT_ADD);
      RecordTick(stats, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                 usage);
    }
  } else if (block_type == BlockType::kIndex) {
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_index_add;
      get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
    } else {
      RecordTick(stats, BLOCK_CACHE_INDEX_ADD);
      RecordTick(stats, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
    }
  } else {
    // All remaining block types share the "data" tickers.
    // TODO: introduce dedicated tickers/statistics/counters
    // for range tombstones
    if (use_get_context) {
      ++get_context->get_context_stats_.num_cache_data_add;
      get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
    } else {
      RecordTick(stats, BLOCK_CACHE_DATA_ADD);
      RecordTick(stats, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
    }
  }
}

865
// Looks up `key` in `block_cache` and updates hit/miss metrics for
// `block_type`. Returns the handle from Cache::Lookup (nullptr on miss).
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    // On a hit, charge the cached entry's size to the "bytes read" metrics.
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

880
// Helper function to setup the cache key's prefix for the Table.
881
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
882 883
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
884
  rep->compressed_cache_key_prefix_size = 0;
885
  if (rep->table_options.block_cache != nullptr) {
886 887
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
888
  }
K
krad 已提交
889 890 891 892 893
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
894 895
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
896
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
897 898 899 900
                        &rep->compressed_cache_key_prefix_size);
  }
}

901 902
// Writes a cache-key prefix for `file` into `buffer` and stores its length
// in `*size`. Prefers the file's own unique id; falls back to a fresh id
// from the cache when the file could not supply one (and `cc` is non-null).
void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

914 915
// WritableFile overload of GenerateCachePrefix; same scheme as the
// RandomAccessFile overload.
// NOTE(review): unlike the other overload, this one does not null-check `cc`
// before calling cc->NewId() — callers appear to always pass a cache here;
// confirm before relying on a null `cc`.
void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (*size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

927 928 929 930 931 932 933 934 935 936 937 938
namespace {
// Return True if table_properties has `user_prop_name` has a `true` value
// or it doesn't contain this property (for backward compatible).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older version doesn't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
939 940
      ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
                     user_prop_name.c_str(), pos->second.c_str());
941 942 943 944
    }
  }
  return true;
}
945

946 947 948 949 950 951 952
// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
953

954
  *seqno = kDisableGlobalSequenceNumber;
955 956
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
957
      std::array<char, 200> msg_buf;
958
      // This is not an external sst file, global_seqno is not supported.
959 960
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
961 962
          "A non-external sst file have global seqno property with value %s",
          seqno_pos->second.c_str());
963
      return Status::Corruption(msg_buf.data());
964
    }
965
    return Status::OK();
966 967 968 969 970
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
971
      std::array<char, 200> msg_buf;
972
      // This is a v1 external sst file, global_seqno is not supported.
973 974 975 976 977
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u have global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
978
    }
979
    return Status::OK();
980 981
  }

982 983 984 985 986 987 988
  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
  // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno
  // to denote it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u have global seqno property "
          "with value %s, while largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
1005
  }
1006
  *seqno = global_seqno;
1007 1008

  if (global_seqno > kMaxSequenceNumber) {
1009 1010 1011 1012 1013 1014
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u have global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
1015 1016
  }

1017
  return Status::OK();
1018
}
1019 1020
}  // namespace

K
krad 已提交
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  // Compose a cache key into caller-supplied storage: the per-file prefix
  // followed by the varint-encoded block offset.
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* const key_end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  const size_t key_len = static_cast<size_t>(key_end - cache_key);
  return Slice(cache_key, key_len);
}

1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
// Opens a block-based table file: prefetches the tail, reads the footer and
// the meta blocks (properties, range tombstones, compression dictionary,
// index, filter), and on success hands the constructed reader back through
// `table_reader`.
Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  // NOTE(review): the PrefetchTail status is overwritten below without being
  // checked; a failed prefetch is apparently tolerated since the footer read
  // can still hit the file directly — confirm this is intentional.
  s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                   preload_all, &prefetch_buffer);

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not mutate rep_ after the creation. eg. internal_prefix_transform
  // raw pointer will be used to create HashIndexReader, whose reset may
  // access a dangling pointer.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->index_type = table_options.index_type;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(prefix_extractor));
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter);
  if (!s.ok()) {
    return s;
  }

  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(),
                                     largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all,
      table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  // Decide how many trailing bytes of the file to prefetch before reading
  // the footer and meta blocks.
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before read footer, readahead backwards to prefetch data. Do more
    // readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }

  // Clamp the prefetch window to the file itself.
  const bool covers_whole_file = file_size < tail_prefetch_size;
  const size_t prefetch_off =
      covers_whole_file ? 0
                        : static_cast<size_t>(file_size - tail_prefetch_size);
  const size_t prefetch_len =
      covers_whole_file ? static_cast<size_t>(file_size) : tail_prefetch_size;
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);

  Status s;
  // TODO should not have this special logic in the future.
  const bool track_in_buffer = file->use_direct_io();
  prefetch_buffer->reset(
      new FilePrefetchBuffer(nullptr, 0, 0, track_in_buffer, true));
  if (track_in_buffer) {
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  } else {
    s = file->Prefetch(prefetch_off, prefetch_len);
  }
  return s;
}

1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214
// Recomputes the checksum of `buf[0..len)` with algorithm `type` and
// compares it against `expected`. Returns Corruption on mismatch or on an
// unknown checksum type.
Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  Status s;
  uint32_t computed = 0;
  if (type == kNoChecksum) {
    // Nothing to compute; comparison below still runs against `expected`.
  } else if (type == kCRC32c) {
    expected = crc32c::Unmask(expected);
    computed = crc32c::Value(buf, len);
  } else if (type == kxxHash) {
    computed = XXH32(buf, static_cast<int>(len), 0);
  } else if (type == kxxHash64) {
    computed = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                    uint64_t{0xffffffff});
  } else {
    s = Status::Corruption("unknown checksum type");
  }
  if (s.ok() && computed != expected) {
    s = Status::Corruption("properties block checksum mismatched");
  }
  return s;
}

1215
// Re-reads the properties block without checksum verification, zeroes out
// the in-buffer global seqno, and re-verifies the checksum manually. Used
// when the normal read failed with a checksum mismatch.
Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      // Restore the pre-ingestion global seqno (0) inside the raw buffer so
      // the checksum matches what SstFileWriter originally wrote.
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    // The stored checksum sits right after the block payload plus the
    // one-byte compression type.
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
                                block_size + 1, value);
  }
  return s;
}

1250
// Locates and reads the table-properties meta block, populating
// rep_->table_properties and derived flags (compression hints, prefix
// extractor, filtering features, global seqno). Missing or unreadable
// properties are logged but mostly tolerated.
Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
  bool found_properties_block = true;
  Status s;
  s = SeekToPropertiesBlock(meta_iter, &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(
          meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, true /* verify_checksum */,
          nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
          false /* compression_type_missing */, nullptr /* memory_allocator */);
    }

    if (s.IsCorruption()) {
      // A checksum mismatch may be caused by ingestion rewriting the global
      // seqno in place; retry with that accounted for.
      s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
                                           &table_properties);
    }
    std::unique_ptr<TableProperties> props_guard;
    if (table_properties != nullptr) {
      props_guard.reset(table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties.reset(props_guard.release());
      // Cache compression-related facts so per-block reads don't have to
      // re-derive them from the property strings.
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }
#ifndef ROCKSDB_LITE
  if (rep_->table_properties) {
    ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
                        &(rep_->table_prefix_extractor));
  }
#endif  // ROCKSDB_LITE

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.info_log);
    rep_->prefix_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kPrefixFiltering,
                           rep_->ioptions.info_log);

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
    }
  }
  return s;
}
1328

1329
// Reads the range-deletion meta block (if present) and builds the
// fragmented range tombstone list in rep_. Seek/read errors are logged and
// returned; an absent block is not an error.
Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
  Status s;
  bool found_range_del_block;
  BlockHandle range_del_handle;
  s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (found_range_del_block && !range_del_handle.IsNull()) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle,
        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}

// Reads the compression dictionary meta block into
// `*compression_dict_block`. A null dictionary handle means the table has no
// dictionary and is not an error.
Status BlockBasedTable::ReadCompressionDictBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<const BlockContents>* compression_dict_block) const {
  assert(compression_dict_block != nullptr);
  Status s;
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()};
    PersistentCacheOptions cache_options;
    ReadOptions read_options;
    read_options.verify_checksums = true;
    // The dictionary block itself is never dictionary-compressed; fetch it
    // with the empty dictionary and without decompression.
    BlockFetcher compression_block_fetcher(
        rep_->file.get(), prefetch_buffer, rep_->footer, read_options,
        rep_->compression_dict_handle, compression_dict_cont.get(),
        rep_->ioptions, false /* decompress */, false /*maybe_compressed*/,
        BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(),
        cache_options);
    s = compression_block_fetcher.ReadBlockContents();

    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from compression dictionary "
          "block %s",
          s.ToString().c_str());
    } else {
      *compression_dict_block = std::move(compression_dict_cont);
    }
  }
  return s;
}

// Locates the filter and compression-dictionary meta blocks, creates the
// index reader, and — depending on the cache/pin/prefetch configuration —
// warms the block cache with (or directly loads) the index, filter, and
// compression dictionary.
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    BlockCacheLookupContext* lookup_context) {
  Status s;

  // Find filter handle and filter type
  if (rep_->filter_policy) {
    // Probe the metaindex for each known filter flavor, in priority order.
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep_->filter_policy->Name());
      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
              .ok()) {
        rep_->filter_type = filter_type;
        break;
      }
    }
  }

  {
    // Find compression dictionary handle
    bool found_compression_dict;
    s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
                                   &rep_->compression_dict_handle);
  }

  BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  // prefetch the first level of index
  const bool prefetch_index =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // prefetch the first level of filter
  const bool prefetch_filter =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partition filters cannot be enabled without partition indexes
  assert(!prefetch_filter || prefetch_index);
  // pin both index and filters, down to all partitions
  const bool pin_all =
      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // pin the first level of index
  const bool pin_index =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // pin the first level of filter
  const bool pin_filter =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);

  IndexReader* index_reader = nullptr;
  if (s.ok()) {
    s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
                                     prefetch_index, pin_index, &index_reader,
                                     lookup_context);
    if (s.ok()) {
      assert(index_reader != nullptr);
      rep_->index_reader.reset(index_reader);
      // The partitions of partitioned index are always stored in cache. They
      // are hence follow the configuration for pin and prefetch regardless of
      // the value of cache_index_and_filter_blocks
      if (prefetch_all) {
        rep_->index_reader->CacheDependencies(pin_all);
      }
    } else {
      delete index_reader;
      index_reader = nullptr;
    }
  }

  // pre-fetching of blocks is turned on
  // Will use block cache for meta-blocks access
  // Always prefetch index and filter for level 0
  // TODO(ajkr): also prefetch compression dictionary block
  // TODO(ajkr): also pin compression dictionary block when
  // `pin_l0_filter_and_index_blocks_in_cache == true`.
  if (table_options.cache_index_and_filter_blocks) {
    assert(table_options.block_cache != nullptr);
    if (s.ok() && prefetch_filter) {
      // Hack: Call GetFilter() to implicitly add filter to the block_cache
      auto filter_entry =
          new_table->GetFilter(rep_->table_prefix_extractor.get(),
                               /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                               /*get_context=*/nullptr, lookup_context);
      if (filter_entry.GetValue() != nullptr && prefetch_all) {
        filter_entry.GetValue()->CacheDependencies(
            pin_all, rep_->table_prefix_extractor.get());
      }
      // if pin_filter is true then save it in rep_->filter_entry; it will be
      // released in the destructor only, hence it will be pinned in the
      // cache while this reader is alive
      if (pin_filter) {
        rep_->filter_entry = std::move(filter_entry);
      }
    }
  } else {
    // Not using the block cache for meta blocks: load filter and
    // compression dictionary directly into the reader.
    std::unique_ptr<const BlockContents> compression_dict_block;
    if (s.ok()) {
      // Set filter block
      if (rep_->filter_policy) {
        const bool is_a_filter_partition = true;
        auto filter = new_table->ReadFilter(
            prefetch_buffer, rep_->filter_handle, !is_a_filter_partition,
            rep_->table_prefix_extractor.get());
        rep_->filter.reset(filter);
        // Refer to the comment above about partitioned indexes always being
        // cached
        if (filter && prefetch_all) {
          filter->CacheDependencies(pin_all,
                                    rep_->table_prefix_extractor.get());
        }
      }
      s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    }
    if (s.ok() && !rep_->compression_dict_handle.IsNull()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      rep_->uncompression_dict.reset(new UncompressionDict(
          compression_dict_block->data.ToString(),
          rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics));
    }
  }
  return s;
}

S
Siying Dong 已提交
1542
void BlockBasedTable::SetupForCompaction() {
1543
  switch (rep_->ioptions.access_hint_on_compaction_start) {
1544 1545 1546
    case Options::NONE:
      break;
    case Options::NORMAL:
1547
      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
1548 1549
      break;
    case Options::SEQUENTIAL:
1550
      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
1551 1552
      break;
    case Options::WILLNEED:
1553
      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
1554 1555 1556 1557 1558 1559
      break;
    default:
      assert(false);
  }
}

K
kailiu 已提交
1560 1561
// Accessor for the table properties that were parsed when the table was
// opened. The properties are shared and immutable, so the shared_ptr can be
// handed out directly.
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}
S
Sanjay Ghemawat 已提交
1564

1565 1566 1567 1568 1569 1570 1571 1572
// Estimate the heap memory held by this reader's auxiliary structures
// (filter, index reader, and uncompression dictionary). Components that
// were never loaded contribute nothing.
size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t total = 0;
  if (rep_->filter != nullptr) {
    total += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader != nullptr) {
    total += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict != nullptr) {
    total += rep_->uncompression_dict->ApproximateMemoryUsage();
  }
  return total;
}

K
Kai Liu 已提交
1579 1580
// Load the meta-block from the file. On success, return the loaded meta block
// and its iterator.
1581
Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer,
                                      std::unique_ptr<Block>* meta_block,
                                      std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  std::unique_ptr<Block> meta;
  // Read the metaindex block (the block pointed to by the footer's
  // metaindex handle).
  Status s = ReadBlockFromFile(
      rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
      rep_->footer.metaindex_handle(), &meta, rep_->ioptions,
      true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
      kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
      GetMemoryAllocator(rep_->table_options));

  if (!s.ok()) {
    // Fix: the message previously said "properties block", but this function
    // reads the metaindex block (see footer.metaindex_handle() above);
    // report the block actually being read.
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Encountered error while reading data from metaindex"
                    " block %s",
                    s.ToString().c_str());
    return s;
  }

  *meta_block = std::move(meta);
  // Meta blocks always use the bytewise comparator for both the internal and
  // the user comparator slots.
  iter->reset(meta_block->get()->NewIterator<DataBlockIter>(
      BytewiseComparator(), BytewiseComparator()));
  return Status::OK();
}

1610 1611
// Look up a block in the two-tier cache hierarchy: first the uncompressed
// block cache, then the compressed block cache. On a compressed-cache hit
// the contents are uncompressed and (if fill_cache allows) inserted into the
// uncompressed cache. On return, `block` references the found/created block;
// it stays empty on a miss. The returned Status reflects read/uncompress/
// insert errors, not cache misses.
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<Block>* block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    GetContext* get_context) const {
  // Read-amp bitmap is only tracked for data blocks.
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  assert(block);
  assert(block->IsEmpty());

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
                                          block_type, get_context);
    if (cache_handle != nullptr) {
      // Hit: the entry stays pinned via `cache_handle` until `block` releases
      // it.
      block->SetCachedValue(
          reinterpret_cast<Block*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);

  Statistics* statistics = rep_->ioptions.statistics;

  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type = compressed_block->get_compression_type();
  // Only compressed blocks are stored in the compressed cache.
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressBlockContents(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert uncompressed block into block cache
  if (s.ok()) {
    std::unique_ptr<Block> block_holder(
        new Block(std::move(contents), rep_->get_global_seqno(block_type),
                  read_amp_bytes_per_bit, statistics));  // uncompressed block

    // Only insert if the block owns its bytes (i.e. it is not backed by the
    // cache entry we are about to release) and the read allows cache fill.
    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                              &DeleteCachedEntry<Block>, &cache_handle);
#ifndef NDEBUG
      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
      if (s.ok()) {
        assert(cache_handle != nullptr);
        // Ownership transfers to the cache; `block` keeps it pinned.
        block->SetCachedValue(block_holder.release(), block_cache,
                              cache_handle);

        UpdateCacheInsertionMetrics(block_type, get_context, charge);
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      // No insertion: `block` takes sole ownership of the uncompressed block.
      block->SetOwnedValue(block_holder.release());
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

// Insert a freshly-read block into the cache hierarchy: the raw (possibly
// compressed) contents may go into the compressed block cache, and the
// uncompressed form goes into the uncompressed block cache. On return,
// `cached_block` references the uncompressed block, either pinned in the
// cache or locally owned when no insertion took place.
// NOTE: `raw_block_contents` is consumed (moved from) by this function.
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    CachableEntry<Block>* cached_block, BlockContents* raw_block_contents,
    CompressionType raw_block_comp_type,
    const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
    MemoryAllocator* memory_allocator, BlockType block_type,
    GetContext* get_context) const {
  const ImmutableCFOptions& ioptions = rep_->ioptions;
  const uint32_t format_version = rep_->table_options.format_version;
  // Read-amp bitmap is only tracked for data blocks.
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  // Meta blocks (filter/index/dict) may be inserted at high priority if the
  // table options request it.
  const Cache::Priority priority =
      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
              (block_type == BlockType::kFilter ||
               block_type == BlockType::kCompressionDictionary ||
               block_type == BlockType::kIndex)
          ? Cache::Priority::HIGH
          : Cache::Priority::LOW;
  assert(cached_block);
  assert(cached_block->IsEmpty());
  assert(raw_block_comp_type == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  Statistics* statistics = ioptions.statistics;

  // Build the uncompressed Block, uncompressing the raw contents first if
  // they arrived compressed.
  std::unique_ptr<Block> block_holder;
  if (raw_block_comp_type != kNoCompression) {
    // Retrieve the uncompressed contents into a new buffer
    BlockContents uncompressed_block_contents;
    UncompressionContext context(raw_block_comp_type);
    UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
    s = UncompressBlockContents(info, raw_block_contents->data.data(),
                                raw_block_contents->data.size(),
                                &uncompressed_block_contents, format_version,
                                ioptions, memory_allocator);
    if (!s.ok()) {
      return s;
    }

    block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  } else {
    block_holder.reset(new Block(std::move(*raw_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr &&
      raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
      raw_block_contents->own_bytes()) {
#ifndef NDEBUG
    assert(raw_block_contents->is_raw_block);
#endif  // NDEBUG

    // We cannot directly put raw_block_contents because this could point to
    // an object in the stack.
    BlockContents* block_cont_for_comp_cache =
        new BlockContents(std::move(*raw_block_contents));
    s = block_cache_compressed->Insert(
        compressed_block_cache_key, block_cont_for_comp_cache,
        block_cont_for_comp_cache->ApproximateMemoryUsage(),
        &DeleteCachedEntry<BlockContents>);
    if (s.ok()) {
      // Avoid the following code to delete this cached block.
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
      // Insertion failed: the cache never took ownership, delete manually.
      delete block_cont_for_comp_cache;
    }
  }

  // insert into uncompressed block cache
  if (block_cache != nullptr && block_holder->own_bytes()) {
    size_t charge = block_holder->ApproximateMemoryUsage();
    Cache::Handle* cache_handle = nullptr;
    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                            &DeleteCachedEntry<Block>, &cache_handle, priority);
#ifndef NDEBUG
    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
    if (s.ok()) {
      assert(cache_handle != nullptr);
      // Ownership transfers to the cache; `cached_block` keeps it pinned.
      cached_block->SetCachedValue(block_holder.release(), block_cache,
                                   cache_handle);

      UpdateCacheInsertionMetrics(block_type, get_context, charge);
    } else {
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
    }
  } else {
    // No uncompressed cache (or non-owned bytes): caller-side ownership only.
    cached_block->SetOwnedValue(block_holder.release());
  }

  return s;
}

M
Maysam Yabandeh 已提交
1810
// Read the filter block at `filter_handle` from the file and construct the
// appropriate FilterBlockReader subclass for rep_->filter_type. Returns a
// heap-allocated reader owned by the caller, or nullptr when there is no
// filter or the block read fails.
FilterBlockReader* BlockBasedTable::ReadFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
    const bool is_a_filter_partition,
    const SliceTransform* prefix_extractor) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;

  // Filter blocks are stored uncompressed, so no decompression is requested.
  BlockFetcher block_fetcher(
      rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
      filter_handle, &block, rep->ioptions, false /* decompress */,
      false /*maybe_compressed*/, BlockType::kFilter,
      UncompressionDict::GetEmptyDict(), rep->persistent_cache_options,
      GetMemoryAllocator(rep->table_options));
  Status s = block_fetcher.ReadBlockContents();

  if (!s.ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  // A single partition of a partitioned filter is itself read as a full
  // filter, so remap the type in that case.
  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_key_is_user_key == 0,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_value_is_delta_encoded == 0);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}

1879
// Convenience overload: fetch the table's own (top-level) filter block.
// Delegates to the handle-taking overload; the top-level filter is, by
// definition, not a filter partition.
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer,
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  return GetFilter(prefetch_buffer, rep_->filter_handle,
                   false /* is_a_filter_partition */, no_io, get_context,
                   lookup_context, prefix_extractor);
}

1889
// Fetch the filter block reader, preferring (in order): the pre-populated
// rep_->filter (when filters are not block-cache managed), the pinned
// rep_->filter_entry, the block cache, and finally a read from the file with
// insertion into the block cache. Records a block-cache trace entry when
// tracing is enabled. Returns an empty entry when no filter is available
// (no_io miss, no filter policy / cache, or insertion failure).
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
    const bool is_a_filter_partition, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    const SliceTransform* prefix_extractor) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr,
            /*own_value=*/false};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return CachableEntry<FilterBlockReader>();
  }

  // A filter already pinned in rep_->filter_entry stays valid for the reader's
  // lifetime, so it can be handed out without a cache handle.
  if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
    return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
            /*cache_handle=*/nullptr, /*own_value=*/false};
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Cache::Handle* cache_handle =
      GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context);

  FilterBlockReader* filter = nullptr;
  size_t usage = 0;
  bool is_cache_hit = false;
  // Deferred so that the trace record below is written even on failure paths.
  bool return_empty_reader = false;
  if (cache_handle != nullptr) {
    filter =
        reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
    usage = filter->ApproximateMemoryUsage();
    is_cache_hit = true;
  } else if (no_io) {
    // Do not invoke any io.
    return_empty_reader = true;
  } else {
    // Cache miss: read the filter block from the file and insert it.
    filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                        is_a_filter_partition, prefix_extractor);
    if (filter != nullptr) {
      usage = filter->ApproximateMemoryUsage();
      Status s = block_cache->Insert(
          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage);
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        // The cache never took ownership; free the reader ourselves.
        delete filter;
        return_empty_reader = true;
      }
    }
  }

  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceFilterBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }

  if (return_empty_reader) {
    return CachableEntry<FilterBlockReader>();
  }
  return {filter, cache_handle ? block_cache : nullptr, cache_handle,
          /*own_value=*/false};
}

1979
// Fetch the table's uncompression dictionary, either directly from rep_
// (when meta-blocks are not cache-managed) or via the block cache, reading
// and inserting it on a miss. Records a block-cache trace entry when tracing
// is enabled. Returns an empty entry when there is no dictionary or it could
// not be obtained (no_io miss, read/insert failure).
CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
    FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  if (!rep_->table_options.cache_index_and_filter_blocks) {
    // block cache is either disabled or not used for meta-blocks. In either
    // case, BlockBasedTableReader is the owner of the uncompression dictionary.
    return {rep_->uncompression_dict.get(), nullptr /* cache */,
            nullptr /* cache_handle */, false /* own_value */};
  }
  if (rep_->compression_dict_handle.IsNull()) {
    // Table has no compression dictionary.
    return CachableEntry<UncompressionDict>();
  }
  char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                  rep_->compression_dict_handle, cache_key_buf);
  auto cache_handle =
      GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key,
                        BlockType::kCompressionDictionary, get_context);
  UncompressionDict* dict = nullptr;
  bool is_cache_hit = false;
  size_t usage = 0;
  if (cache_handle != nullptr) {
    dict = reinterpret_cast<UncompressionDict*>(
        rep_->table_options.block_cache->Value(cache_handle));
    is_cache_hit = true;
    usage = dict->ApproximateMemoryUsage();
  } else if (no_io) {
    // Do not invoke any io.
  } else {
    // Cache miss: read the dictionary block and insert it into the cache.
    std::unique_ptr<const BlockContents> compression_dict_block;
    Status s =
        ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    if (s.ok()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      std::unique_ptr<UncompressionDict> uncompression_dict(
          new UncompressionDict(compression_dict_block->data.ToString(),
                                rep_->blocks_definitely_zstd_compressed,
                                rep_->ioptions.statistics));
      usage = uncompression_dict->ApproximateMemoryUsage();
      s = rep_->table_options.block_cache->Insert(
          cache_key, uncompression_dict.get(), usage,
          &DeleteCachedUncompressionDictEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);

      if (s.ok()) {
        UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary,
                                    get_context, usage);
        // Ownership moved to the cache; keep a raw pointer for the caller.
        dict = uncompression_dict.release();
      } else {
        // Insertion failed: unique_ptr frees the dictionary on scope exit.
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        assert(dict == nullptr);
        assert(cache_handle == nullptr);
      }
    }
  }
  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, cache_key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }
  return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
          cache_handle, false /* own_value */};
}

2056 2057
// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
2058
// Create an iterator over the table's index block by delegating to the
// index reader. If `input_iter` is non-null it is reused; otherwise a new
// iterator is allocated.
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, bool disable_prefix_seek,
    IndexBlockIter* input_iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  assert(rep_ != nullptr);
  assert(rep_->index_reader != nullptr);

  // Index blocks never hand out pinned contents here, so there is no need
  // to propagate `block_contents_pinned`.
  return rep_->index_reader->NewIterator(
      read_options, disable_prefix_seek, input_iter, get_context,
      lookup_context);
}

L
Lei Jin 已提交
2072 2073
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
2074 2075
// If input_iter is null, new a iterator
// If input_iter is not null, update this iter and return it
M
Maysam Yabandeh 已提交
2076 2077
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
    BlockType block_type, bool key_includes_seq, bool index_key_is_full,
    GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  // Reuse the caller-provided iterator when available; otherwise allocate.
  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  if (!s.ok()) {
    // Propagate an error passed in by the caller through the iterator.
    iter->Invalidate(s);
    return iter;
  }

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  // Keep the dictionary entry alive (pinned) for the duration of the block
  // retrieval below.
  auto uncompression_dict_storage =
      GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context);
  const UncompressionDict& uncompression_dict =
      uncompression_dict_storage.GetValue() == nullptr
          ? UncompressionDict::GetEmptyDict()
          : *uncompression_dict_storage.GetValue();

  CachableEntry<Block> block;
  s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block,
                    block_type, get_context, lookup_context, for_compaction);

  if (!s.ok()) {
    assert(block.IsEmpty());
    iter->Invalidate(s);
    return iter;
  }

  assert(block.GetValue() != nullptr);
  constexpr bool kTotalOrderSeek = true;
  // Block contents are pinned and it is still pinned after the iterator
  // is destroyed as long as cleanup functions are moved to another object,
  // when:
  // 1. block cache handle is set to be released in cleanup function, or
  // 2. it's pointing to immortal source. If own_bytes is true then we are
  //    not reading data from the original source, whether immortal or not.
  //    Otherwise, the block is pinned iff the source is immortal.
  const bool block_contents_pinned =
      block.IsCached() ||
      (!block.GetValue()->own_bytes() && rep_->immortal_table);
  iter = block.GetValue()->NewIterator<TBlockIter>(
      &rep_->internal_comparator, rep_->internal_comparator.user_comparator(),
      iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
      index_key_is_full, block_contents_pinned);

  if (!block.IsCached()) {
    if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
      // insert a dummy record to block cache to track the memory usage
      Cache* const block_cache = rep_->table_options.block_cache.get();
      Cache::Handle* cache_handle = nullptr;
      // There are two other types of cache keys: 1) SST cache key added in
      // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
      // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
      // from SST cache key(31 bytes), and use non-zero prefix to
      // differentiate from `write_buffer_manager`
      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
      // Prefix: use rep_->cache_key_prefix padded by 0s
      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
      assert(rep_->cache_key_prefix_size != 0);
      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
      // Suffix: a fresh id, so every dummy entry is unique.
      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                 next_cache_key_id_++);
      assert(end - cache_key <=
             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
      // nullptr value + charge: accounts for the block's memory without
      // actually caching its contents.
      s = block_cache->Insert(unique_key, nullptr,
                              block.GetValue()->ApproximateMemoryUsage(),
                              nullptr, &cache_handle);

      if (s.ok()) {
        assert(cache_handle != nullptr);
        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
                              cache_handle);
      }
    }
  } else {
    iter->SetCacheHandle(block.GetCacheHandle());
  }

  // Hand the block's ownership/pin over to the iterator's cleanup chain.
  block.TransferTo(iter);
  return iter;
}

2165
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
2166
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
2167
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
2168
    CachableEntry<Block>* block_entry, BlockType block_type,
2169
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
2170
  assert(block_entry != nullptr);
2171
  const bool no_io = (ro.read_tier == kBlockCacheTier);
2172
  Cache* block_cache = rep_->table_options.block_cache.get();
2173
  // No point to cache compressed blocks if it never goes away
2174
  Cache* block_cache_compressed =
2175 2176
      rep_->immortal_table ? nullptr
                           : rep_->table_options.block_cache_compressed.get();
L
Lei Jin 已提交
2177

2178 2179
  // First, try to get the block from the cache
  //
L
Lei Jin 已提交
2180
  // If either block cache is enabled, we'll try to read from it.
2181
  Status s;
2182 2183 2184 2185
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice key /* key to the block cache */;
  Slice ckey /* key to the compressed block cache */;
2186 2187
  bool is_cache_hit = false;
  bool no_insert = true;
L
Lei Jin 已提交
2188 2189 2190
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    // create key for block cache
    if (block_cache != nullptr) {
2191
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
2192
                        handle, cache_key);
L
Lei Jin 已提交
2193 2194 2195
    }

    if (block_cache_compressed != nullptr) {
2196 2197
      ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
                         rep_->compressed_cache_key_prefix_size, handle,
L
Lei Jin 已提交
2198 2199 2200
                         compressed_cache_key);
    }

2201
    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
2202
                              ro, block_entry, uncompression_dict, block_type,
2203
                              get_context);
2204 2205 2206 2207 2208
    if (block_entry->GetValue()) {
      // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
      // compressed block cache.
      is_cache_hit = true;
    }
2209 2210
    // Can't find the block from the cache. If I/O is allowed, read from the
    // file.
2211
    if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
2212
      no_insert = false;
2213
      Statistics* statistics = rep_->ioptions.statistics;
2214
      bool do_decompress =
2215
          block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
2216 2217
      CompressionType raw_block_comp_type;
      BlockContents raw_block_contents;
L
Lei Jin 已提交
2218
      {
2219
        StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
2220
        BlockFetcher block_fetcher(
2221 2222 2223
            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
            &raw_block_contents, rep_->ioptions,
            do_decompress /* do uncompress */, rep_->blocks_maybe_compressed,
2224
            block_type, uncompression_dict, rep_->persistent_cache_options,
2225 2226
            GetMemoryAllocator(rep_->table_options),
            GetMemoryAllocatorForCompressedBlock(rep_->table_options));
2227 2228
        s = block_fetcher.ReadBlockContents();
        raw_block_comp_type = block_fetcher.get_compression_type();
L
Lei Jin 已提交
2229 2230 2231
      }

      if (s.ok()) {
2232
        SequenceNumber seq_no = rep_->get_global_seqno(block_type);
2233 2234
        // If filling cache is allowed and a cache is configured, try to put the
        // block to the cache.
2235 2236 2237 2238
        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
                                block_entry, &raw_block_contents,
                                raw_block_comp_type, uncompression_dict, seq_no,
                                GetMemoryAllocator(rep_->table_options),
2239
                                block_type, get_context);
L
Lei Jin 已提交
2240 2241 2242
      }
    }
  }
2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295

  // Fill lookup_context.
  if (block_cache_tracer_ && lookup_context) {
    size_t usage = 0;
    uint64_t nkeys = 0;
    if (block_entry->GetValue()) {
      // Approximate the number of keys in the block using restarts.
      nkeys = rep_->table_options.block_restart_interval *
              block_entry->GetValue()->NumRestarts();
      usage = block_entry->GetValue()->ApproximateMemoryUsage();
    }
    TraceType trace_block_type = TraceType::kTraceMax;
    switch (block_type) {
      case BlockType::kIndex:
        trace_block_type = TraceType::kBlockTraceIndexBlock;
        break;
      case BlockType::kData:
        trace_block_type = TraceType::kBlockTraceDataBlock;
        break;
      case BlockType::kRangeDeletion:
        trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
        break;
      default:
        // This cannot happen.
        assert(false);
        break;
    }
    if (BlockCacheTraceHelper::ShouldTraceReferencedKey(
            trace_block_type, lookup_context->caller)) {
      // Defer logging the access to Get() and MultiGet() to trace additional
      // information, e.g., the referenced key,
      // referenced_key_exist_in_block.

      // Make a copy of the block key here since it will be logged later.
      lookup_context->FillLookupContext(
          is_cache_hit, no_insert, trace_block_type,
          /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
    } else {
      // Avoid making copy of block_key and cf_name when constructing the access
      // record.
      BlockCacheTraceRecord access_record(
          rep_->ioptions.env->NowMicros(),
          /*block_key=*/"", trace_block_type,
          /*block_size=*/usage, rep_->cf_id_for_tracing(),
          /*cf_name=*/"", rep_->level_for_tracing(),
          rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
          no_insert);
      block_cache_tracer_->WriteBlockAccess(access_record, key,
                                            rep_->cf_name_for_tracing(),
                                            /*referenced_key=*/nullptr);
    }
  }

2296
  assert(s.ok() || block_entry->GetValue() == nullptr);
2297
  return s;
2298 2299
}

2300
// Retrieves the uncompressed block identified by `handle`. First consults the
// block cache(s) via MaybeReadBlockAndLoadToCache() when the block type is
// eligible for caching; on a miss (or when caching is skipped) falls back to
// reading the block directly from the file, unless the read tier forbids I/O.
// On success `*block_entry` holds the block (owned when read from file).
Status BlockBasedTable::RetrieveBlock(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context,
    bool for_compaction) const {
  assert(block_entry);
  assert(block_entry->IsEmpty());

  Status s;
  // Data and range-deletion blocks always go through the cache path;
  // filter/index/compression-dictionary blocks only when the table is
  // configured with cache_index_and_filter_blocks.
  if (rep_->table_options.cache_index_and_filter_blocks ||
      (block_type != BlockType::kFilter &&
       block_type != BlockType::kCompressionDictionary &&
       block_type != BlockType::kIndex)) {
    s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                     uncompression_dict, block_entry,
                                     block_type, get_context, lookup_context);

    if (!s.ok()) {
      return s;
    }

    if (block_entry->GetValue() != nullptr) {
      // Cache hit (or the block was read and inserted into the cache).
      assert(s.ok());
      return s;
    }
  }

  // Either the cache path was skipped, or the lookup missed without filling
  // the entry (e.g. fill_cache is false). Read straight from the file.
  assert(block_entry->IsEmpty());

  const bool no_io = ro.read_tier == kBlockCacheTier;
  if (no_io) {
    return Status::Incomplete("no blocking io");
  }

  std::unique_ptr<Block> block;

  {
    StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
                 READ_BLOCK_GET_MICROS);
    s = ReadBlockFromFile(
        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
        rep_->ioptions, rep_->blocks_maybe_compressed,
        rep_->blocks_maybe_compressed, block_type, uncompression_dict,
        rep_->persistent_cache_options, rep_->get_global_seqno(block_type),
        // Read-amplification accounting only applies to data blocks.
        block_type == BlockType::kData
            ? rep_->table_options.read_amp_bytes_per_bit
            : 0,
        GetMemoryAllocator(rep_->table_options), for_compaction);
  }

  if (!s.ok()) {
    return s;
  }

  // The entry takes ownership of the freshly read block (it is not cached).
  block_entry->SetOwnedValue(block.release());

  assert(s.ok());
  return s;
}

2361
// State shared by the two-level index iterator over a partitioned index:
// `block_map` maps an index-partition offset to its (pinned) block so
// secondary iterators can be created without further cache lookups.
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
    const BlockBasedTable* table,
    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
    bool index_key_includes_seq, bool index_key_is_full)
    : table_(table),
      block_map_(block_map),
      index_key_includes_seq_(index_key_includes_seq),
      index_key_is_full_(index_key_is_full) {}
M
Maysam Yabandeh 已提交
2369

2370
// Returns an iterator over the index partition identified by `handle`, using
// the pre-loaded block from `block_map_` when available.
InternalIteratorBase<BlockHandle>*
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
    const BlockHandle& handle) {
  // Return a block iterator on the index partition
  auto block = block_map_->find(handle.offset());
  if (block != block_map_->end()) {
    auto rep = table_->get_rep();
    assert(rep);

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    return block->second.GetValue()->NewIterator<IndexBlockIter>(
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
        nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
  }
  // Partition absent from the map: this is a possible scenario since the
  // block cache might not have had space for it. Return an empty iterator,
  // whose status reports the condition to the caller.
  return new IndexBlockIter();
}

T
Tyler Harter 已提交
2392 2393
// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(
    const Slice& internal_key, const ReadOptions& read_options,
    const SliceTransform* options_prefix_extractor,
    const bool need_upper_bound_check,
    BlockCacheLookupContext* lookup_context) const {
  // Without a filter policy there is nothing to consult; must assume a match.
  if (!rep_->filter_policy) {
    return true;
  }

  const SliceTransform* prefix_extractor;

  if (rep_->table_prefix_extractor == nullptr) {
    if (need_upper_bound_check) {
      return true;
    }
    prefix_extractor = options_prefix_extractor;
  } else {
    prefix_extractor = rep_->table_prefix_extractor.get();
  }
  auto user_key = ExtractUserKey(internal_key);
  // Keys outside the extractor's domain have no prefix; assume a match.
  if (!prefix_extractor->InDomain(user_key)) {
    return true;
  }

  bool may_match = true;
  Status s;

  // First, try check with full filter
  auto filter_entry =
      GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                /*get_context=*/nullptr, lookup_context);
  FilterBlockReader* filter = filter_entry.GetValue();
  bool filter_checked = true;
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
      // Full (or partitioned) filter: a single range query suffices.
      const Slice* const const_ikey_ptr = &internal_key;
      may_match = filter->RangeMayExist(
          read_options.iterate_upper_bound, user_key, prefix_extractor,
          rep_->internal_comparator.user_comparator(), const_ikey_ptr,
          &filter_checked, need_upper_bound_check, lookup_context);
    } else {
      // if prefix_extractor changed for block based filter, skip filter
      if (need_upper_bound_check) {
        return true;
      }
      auto prefix = prefix_extractor->Transform(user_key);
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

      // Then, try find it within each block
      // we already know prefix_extractor and prefix_extractor_name must match
      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
      std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
          no_io_read_options,
          /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
          /*get_context=*/nullptr, lookup_context));
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
      } else if ((rep_->table_properties &&
                          rep_->table_properties->index_key_is_user_key
                      ? iiter->key()
                      : ExtractUserKey(iiter->key()))
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block.  However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix.  Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix.  Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key.  Thus, the corresponding data block
        // is the only on could potentially contain the prefix.
        BlockHandle handle = iiter->value();
        may_match = filter->PrefixMayMatch(
            prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
            /*const_key_ptr=*/nullptr, lookup_context);
      }
    }
  }

  // Record filter effectiveness statistics when the filter was consulted.
  if (filter_checked) {
    Statistics* statistics = rep_->ioptions.statistics;
    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
    if (!may_match) {
      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
    }
  }

  return may_match;
}

2515 2516
// Positions the iterator at the first entry >= `target`. Avoids a redundant
// index seek when the target provably lies within the current data block.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
  is_out_of_bound_ = false;
  // Prefix bloom says no key with this prefix exists: invalidate and stop.
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  bool need_seek_index = true;
  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
    // Reseek.
    prev_index_value_ = index_iter_->value();
    // We can avoid an index seek if:
    // 1. The new seek key is larger than the current key
    // 2. The new seek key is within the upper bound of the block
    // Since we don't necessarily know the internal key for either
    // the current key or the upper bound, we check user keys and
    // exclude the equality case. Considering internal keys can
    // improve for the boundary cases, but it would complicate the
    // code.
    if (user_comparator_.Compare(ExtractUserKey(target),
                                 block_iter_.user_key()) > 0 &&
        user_comparator_.Compare(ExtractUserKey(target),
                                 index_iter_->user_key()) < 0) {
      need_seek_index = false;
    }
  }

  if (need_seek_index) {
    index_iter_->Seek(target);
    if (!index_iter_->Valid()) {
      // Target is past the last data block.
      ResetDataIter();
      return;
    }
    InitDataBlock();
  }

  block_iter_.Seek(target);

  FindKeyForward();
  CheckOutOfBound();
  // Post-condition: if positioned, the current key is >= target under the
  // comparator appropriate for whether keys carry sequence numbers.
  assert(
      !block_iter_.Valid() ||
      (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) ||
      (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target),
                                                      block_iter_.key()) <= 0));
}

2563 2564 2565
// Positions the iterator at the last entry <= `target`.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
  is_out_of_bound_ = false;
  // Prefix bloom says no key with this prefix exists: invalidate and stop.
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely to contain the position for `target`, the
  // same as Seek(), rather than than before.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4]  [6, 8] [10, 12]
  //  (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they call Seek(7).
  // The only case where the block is difference is when they seek to a position
  // in the boundary. For example, if they SeekForPrev(5), we should go to the
  // first block, rather than the second. However, we don't have the information
  // to distinguish the two unless we read the second block. In this case, we'll
  // end up with reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    // Target is past the last index entry; fall back to the last block.
    index_iter_->SeekToLast();
    if (!index_iter_->Valid()) {
      // Empty index: nothing to position on.
      ResetDataIter();
      block_iter_points_to_real_block_ = false;
      return;
    }
  }

  InitDataBlock();

  block_iter_.SeekForPrev(target);

  FindKeyBackward();
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
}

2607 2608
// Positions the iterator at the first entry of the table.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToFirst();
  if (index_iter_->Valid()) {
    // Load the first data block and walk forward past any empty blocks.
    InitDataBlock();
    block_iter_.SeekToFirst();
    FindKeyForward();
    CheckOutOfBound();
  } else {
    // Empty index: the table has no data blocks to position on.
    ResetDataIter();
  }
}

2622 2623
// Positions the iterator at the last entry of the table.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToLast();
  if (index_iter_->Valid()) {
    // Load the last data block and walk backward past any empty blocks.
    InitDataBlock();
    block_iter_.SeekToLast();
    FindKeyBackward();
  } else {
    // Empty index: the table has no data blocks to position on.
    ResetDataIter();
  }
}

2636 2637
// Advances to the next entry, crossing into following data blocks as needed.
// REQUIRES: the iterator is currently positioned on a real data block.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Next();
  FindKeyForward();
}

2643 2644
// Advances the iterator and, when still valid, stores the new key in
// `ret_key`. Returns whether the iterator remains valid after the advance.
template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
    Slice* ret_key) {
  Next();
  const bool still_valid = Valid();
  if (still_valid) {
    *ret_key = key();
  }
  return still_valid;
}

2654 2655
// Moves to the previous entry, crossing into preceding data blocks as needed.
// REQUIRES: the iterator is currently positioned on a real data block.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Prev();
  FindKeyBackward();
}

2661 2662 2663 2664 2665 2666 2667
// Upper bound for the implicit (auto) readahead window used by
// InitDataBlock(). Found that 256 KB readahead size provides the best
// performance, based on experiments, for auto readahead. Experiment data is
// in PR #3282.
template <class TBlockIter, typename TValue>
const size_t
    BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
        256 * 1024;

2668 2669 2670
// (Re)creates `block_iter_` for the data block the index iterator currently
// points at, setting up file readahead/prefetching along the way. A no-op if
// the iterator already points at that block and its last read succeeded.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value();
  if (!block_iter_points_to_real_block_ ||
      data_block_handle.offset() != prev_index_value_.offset() ||
      // if previous attempt of reading the block missed cache, try again
      block_iter_.status().IsIncomplete()) {
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

    // Prefetch additional data for range scans (iterators). Enabled only for
    // user reads.
    // Implicit auto readahead:
    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
    // Explicit user requested readahead:
    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
    if (lookup_context_.caller != TableReaderCaller::kCompaction) {
      if (read_options_.readahead_size == 0) {
        // Implicit auto readahead
        num_file_reads_++;
        if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
          if (!rep->file->use_direct_io() &&
              (data_block_handle.offset() +
                   static_cast<size_t>(data_block_handle.size()) +
                   kBlockTrailerSize >
               readahead_limit_)) {
            // Buffered I/O
            // Discarding the return status of Prefetch calls intentionally, as
            // we can fallback to reading from disk if Prefetch fails.
            rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
            readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
                                                   readahead_size_);
            // Keep exponentially increasing readahead size until
            // kMaxAutoReadaheadSize.
            readahead_size_ =
                std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
          } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
            // Direct I/O
            // Let FilePrefetchBuffer take care of the readahead.
            prefetch_buffer_.reset(
                new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
                                       kMaxAutoReadaheadSize));
          }
        }
      } else if (!prefetch_buffer_) {
        // Explicit user requested readahead
        // The actual condition is:
        // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
        prefetch_buffer_.reset(new FilePrefetchBuffer(
            rep->file.get(), read_options_.readahead_size,
            read_options_.readahead_size));
      }
    } else if (!prefetch_buffer_) {
      // Compaction reads use a fixed-size prefetch buffer.
      prefetch_buffer_.reset(
          new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_,
                                 compaction_readahead_size_));
    }

    Status s;
    table_->NewDataBlockIterator<TBlockIter>(
        read_options_, data_block_handle, &block_iter_, block_type_,
        key_includes_seq_, index_key_is_full_,
        /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
        /*for_compaction=*/lookup_context_.caller ==
            TableReaderCaller::kCompaction);
    block_iter_points_to_real_block_ = true;
  }
}

2739
// Advances through data blocks until a block with at least one entry is
// found, an error occurs, the upper bound is crossed, or the table ends.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
  // TODO the while loop inherits from two-level-iterator. We don't know
  // whether a block can be empty so it can be replaced by an "if".
  do {
    if (!block_iter_.status().ok()) {
      // Preserve the error status; the iterator stays invalid.
      return;
    }
    // Whether next data block is out of upper bound, if there is one.
    // Must be computed before ResetDataIter()/Next() mutate the state.
    bool next_block_is_out_of_bound = false;
    if (read_options_.iterate_upper_bound != nullptr &&
        block_iter_points_to_real_block_) {
      next_block_is_out_of_bound =
          (user_comparator_.Compare(*read_options_.iterate_upper_bound,
                                    index_iter_->user_key()) <= 0);
    }
    ResetDataIter();
    index_iter_->Next();
    if (next_block_is_out_of_bound) {
      // The next block is out of bound. No need to read it.
      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
      // We need to make sure this is not the last data block before setting
      // is_out_of_bound_, since the index key for the last data block can be
      // larger than smallest key of the next file on the same level.
      if (index_iter_->Valid()) {
        is_out_of_bound_ = true;
      }
      return;
    }

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToFirst();
    } else {
      // Reached the end of the table.
      return;
    }
  } while (!block_iter_.Valid());
}

// Ensures the iterator is positioned on a valid entry after a forward move,
// advancing into subsequent data blocks if the current one is exhausted.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  assert(!is_out_of_bound_);

  if (!block_iter_.Valid()) {
    FindBlockForward();
  }
}

2787 2788
// Ensures the iterator is positioned on a valid entry after a backward move,
// stepping into preceding data blocks if the current one is exhausted.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
  while (!block_iter_.Valid()) {
    if (!block_iter_.status().ok()) {
      // Preserve the error status; the iterator stays invalid.
      return;
    }

    ResetDataIter();
    index_iter_->Prev();

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToLast();
    } else {
      // Reached the beginning of the table.
      return;
    }
  }

  // We could have check lower bound here too, but we opt not to do it for
  // code simplicity.
}

2809 2810 2811 2812 2813 2814 2815 2816 2817
// Updates is_out_of_bound_ from the current position. Only meaningful when an
// upper bound is set and the iterator sits on a real data block entry;
// otherwise is_out_of_bound_ is left untouched.
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
  const Slice* const upper_bound = read_options_.iterate_upper_bound;
  if (upper_bound == nullptr || !block_iter_points_to_real_block_ ||
      !block_iter_.Valid()) {
    return;
  }
  // Out of bound when the (exclusive) upper bound is <= the current user key.
  is_out_of_bound_ = user_comparator_.Compare(*upper_bound, user_key()) <= 0;
}

2818 2819
// Creates a new iterator over this table. When `arena` is supplied the
// iterator is placement-constructed in it (the caller must not delete it
// directly); otherwise it is heap-allocated and owned by the caller.
InternalIterator* BlockBasedTable::NewIterator(
    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
    Arena* arena, bool skip_filters, TableReaderCaller caller,
    size_t compaction_readahead_size) {
  BlockCacheLookupContext lookup_context{caller};
  // If the prefix extractor differs from the one the table was built with,
  // seeks need an extra upper-bound check.
  bool need_upper_bound_check =
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  // NOTE(review): the two branches pass slightly different third arguments to
  // NewIndexIterator (the heap branch additionally requires kHashSearch) —
  // preserved as-is; confirm whether the asymmetry is intentional.
  if (arena == nullptr) {
    return new BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(
            read_options,
            need_upper_bound_check &&
                rep_->index_type == BlockBasedTableOptions::kHashSearch,
            /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller,
        compaction_readahead_size);
  } else {
    auto* mem =
        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
    return new (mem) BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(read_options, need_upper_bound_check,
                         /*input_iter=*/nullptr, /*get_context=*/nullptr,
                         &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller,
        compaction_readahead_size);
  }
}

2852
// Returns an iterator over this table's fragmented range tombstones, limited
// to those visible at the read snapshot, or nullptr when the table has none.
FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
    const ReadOptions& read_options) {
  if (rep_->fragmented_range_dels == nullptr) {
    return nullptr;
  }
  // With no snapshot, every tombstone is visible.
  const SequenceNumber snapshot =
      read_options.snapshot != nullptr
          ? read_options.snapshot->GetSequenceNumber()
          : kMaxSequenceNumber;
  return new FragmentedRangeTombstoneIterator(
      rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
}

2865 2866 2867
// Consults a full (non-block-based) filter for a single key. Returns true
// (may match) unless the filter definitively excludes the key, either via
// whole-key filtering or, when applicable, via a prefix check.
bool BlockBasedTable::FullFilterKeyMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    const Slice& internal_key, const bool no_io,
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
  // Block-based filters are checked per data block elsewhere, not here.
  if (filter == nullptr || filter->IsBlockBased()) {
    return true;
  }
  Slice user_key = ExtractUserKey(internal_key);
  const Slice* const const_ikey_ptr = &internal_key;
  bool may_match = true;
  if (filter->whole_key_filtering()) {
    // Filters are built on keys without the user timestamp; strip it first.
    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
    may_match =
        filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
                            no_io, const_ikey_ptr, lookup_context);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             // Prefix check is valid only when the extractor matches the one
             // the table was built with and the key is in its domain.
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0 &&
             prefix_extractor->InDomain(user_key) &&
             !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
                                     prefix_extractor, kNotValid, false,
                                     const_ikey_ptr, lookup_context)) {
    may_match = false;
  }
  if (may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
  }
  return may_match;
}

2899 2900 2901
void BlockBasedTable::FullFilterKeysMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    MultiGetRange* range, const bool no_io,
2902 2903
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
2904 2905 2906 2907
  if (filter == nullptr || filter->IsBlockBased()) {
    return;
  }
  if (filter->whole_key_filtering()) {
2908 2909
    filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
                         lookup_context);
2910 2911 2912 2913 2914 2915 2916 2917 2918 2919
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0) {
    for (auto iter = range->begin(); iter != range->end(); ++iter) {
      Slice user_key = iter->lkey->user_key();

      if (!prefix_extractor->InDomain(user_key)) {
        range->SkipKey(iter);
      }
    }
2920 2921
    filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
                             lookup_context);
2922 2923 2924
  }
}

2925
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2926 2927 2928
                            GetContext* get_context,
                            const SliceTransform* prefix_extractor,
                            bool skip_filters) {
M
Maysam Yabandeh 已提交
2929
  assert(key.size() >= 8);  // key must be internal key
S
Sanjay Ghemawat 已提交
2930
  Status s;
M
Maysam Yabandeh 已提交
2931
  const bool no_io = read_options.read_tier == kBlockCacheTier;
2932
  CachableEntry<FilterBlockReader> filter_entry;
2933 2934
  bool may_match;
  FilterBlockReader* filter = nullptr;
2935
  BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet};
2936 2937
  {
    if (!skip_filters) {
2938 2939 2940
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               get_context, &lookup_context);
2941
    }
2942
    filter = filter_entry.GetValue();
2943

2944 2945 2946
    // First check the full filter
    // If full filter not useful, Then go into each block
    may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io,
2947
                                      prefix_extractor, &lookup_context);
2948 2949
  }
  if (!may_match) {
2950
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2951
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2952
  } else {
M
Maysam Yabandeh 已提交
2953
    IndexBlockIter iiter_on_stack;
2954 2955
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2956
    bool need_upper_bound_check = false;
2957
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2958
      need_upper_bound_check = PrefixExtractorChanged(
2959
          rep_->table_properties.get(), prefix_extractor);
2960
    }
2961 2962 2963
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         get_context, &lookup_context);
2964
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
M
Maysam Yabandeh 已提交
2965
    if (iiter != &iiter_on_stack) {
M
Maysam Yabandeh 已提交
2966
      iiter_unique_ptr.reset(iiter);
M
Maysam Yabandeh 已提交
2967
    }
2968

2969 2970
    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
2971
    bool matched = false;  // if such user key mathced a key in SST
2972
    bool done = false;
M
Maysam Yabandeh 已提交
2973
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
2974
      BlockHandle handle = iiter->value();
2975

2976 2977
      bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() == true &&
2978
          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
2979 2980
                               prefix_extractor, handle.offset(), no_io,
                               /*const_ikey_ptr=*/nullptr, &lookup_context);
2981 2982 2983 2984 2985 2986

      if (not_exist_in_filter) {
        // Not found
        // TODO: think about interaction with Merge. If a user key cannot
        // cross one data block, we should be fine.
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2987
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2988 2989
        break;
      } else {
2990
        BlockCacheLookupContext lookup_data_block_context{
2991
            TableReaderCaller::kUserGet};
2992
        bool does_referenced_key_exist = false;
M
Maysam Yabandeh 已提交
2993
        DataBlockIter biter;
2994
        uint64_t referenced_data_size = 0;
M
Maysam Yabandeh 已提交
2995
        NewDataBlockIterator<DataBlockIter>(
2996
            read_options, iiter->value(), &biter, BlockType::kData,
2997
            /*key_includes_seq=*/true,
2998
            /*index_key_is_full=*/true, get_context, &lookup_data_block_context,
2999
            /*s=*/Status(), /*prefetch_buffer*/ nullptr);
3000

3001
        if (read_options.read_tier == kBlockCacheTier &&
M
Maysam Yabandeh 已提交
3002
            biter.status().IsIncomplete()) {
3003
          // couldn't get block from block_cache
3004 3005
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
3006
          get_context->MarkKeyMayExist();
3007 3008
          break;
        }
M
Maysam Yabandeh 已提交
3009 3010
        if (!biter.status().ok()) {
          s = biter.status();
3011 3012 3013
          break;
        }

3014
        bool may_exist = biter.SeekForGet(key);
3015 3016 3017
        // If user-specified timestamp is supported, we cannot end the search
        // just because hash index lookup indicates the key+ts does not exist.
        if (!may_exist && ts_sz == 0) {
3018 3019 3020 3021
          // HashSeek cannot find the key this block and the the iter is not
          // the end of the block, i.e. cannot be in the following blocks
          // either. In this case, the seek_key cannot be found, so we break
          // from the top level for-loop.
3022 3023 3024 3025 3026 3027 3028 3029
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }
3030

3031 3032 3033 3034 3035 3036 3037 3038
            if (!get_context->SaveValue(
                    parsed_key, biter.value(), &matched,
                    biter.IsValuePinned() ? &biter : nullptr)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
3039
          }
3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059
          s = biter.status();
        }
        // Write the block cache access record.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
3060
        }
S
Sanjay Ghemawat 已提交
3061
      }
3062

M
Maysam Yabandeh 已提交
3063 3064 3065 3066
      if (done) {
        // Avoid the extra Next which is expensive in two-level indexes
        break;
      }
3067
    }
3068 3069
    if (matched && filter != nullptr && !filter->IsBlockBased()) {
      RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
3070 3071
      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                rep_->level);
3072
    }
3073
    if (s.ok()) {
M
Maysam Yabandeh 已提交
3074
      s = iiter->status();
S
Sanjay Ghemawat 已提交
3075 3076
    }
  }
K
Kai Liu 已提交
3077

S
Sanjay Ghemawat 已提交
3078 3079 3080
  return s;
}

3081 3082 3083 3084 3085
using MultiGetRange = MultiGetContext::Range;
void BlockBasedTable::MultiGet(const ReadOptions& read_options,
                               const MultiGetRange* mget_range,
                               const SliceTransform* prefix_extractor,
                               bool skip_filters) {
3086
  BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet};
3087 3088 3089 3090 3091 3092 3093 3094
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  FilterBlockReader* filter = nullptr;
  MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
                               mget_range->end());
  {
    if (!skip_filters) {
      // TODO: Figure out where the stats should go
3095
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
3096
                               read_options.read_tier == kBlockCacheTier,
3097
                               /*get_context=*/nullptr, &lookup_context);
3098
    }
3099
    filter = filter_entry.GetValue();
3100 3101 3102 3103

    // First check the full filter
    // If full filter not useful, Then go into each block
    FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
3104
                           prefix_extractor, &lookup_context);
3105 3106 3107 3108 3109 3110 3111 3112 3113 3114
  }
  if (skip_filters || !sst_file_range.empty()) {
    IndexBlockIter iiter_on_stack;
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
3115 3116
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
3117
                         sst_file_range.begin()->get_context, &lookup_context);
3118 3119 3120 3121 3122
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

3123 3124
    DataBlockIter biter;
    uint64_t offset = std::numeric_limits<uint64_t>::max();
3125 3126 3127 3128 3129 3130 3131 3132
    for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
         ++miter) {
      Status s;
      GetContext* get_context = miter->get_context;
      const Slice& key = miter->ikey;
      bool matched = false;  // if such user key matched a key in SST
      bool done = false;
      for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
3133
        bool reusing_block = true;
3134 3135 3136
        uint64_t referenced_data_size = 0;
        bool does_referenced_key_exist = false;
        BlockCacheLookupContext lookup_data_block_context(
3137
            TableReaderCaller::kUserMultiGet);
3138 3139 3140 3141
        if (iiter->value().offset() != offset) {
          offset = iiter->value().offset();
          biter.Invalidate(Status::OK());
          NewDataBlockIterator<DataBlockIter>(
3142 3143
              read_options, iiter->value(), &biter, BlockType::kData,
              /*key_includes_seq=*/false,
3144 3145
              /*index_key_is_full=*/true, get_context,
              &lookup_data_block_context, Status(), nullptr);
3146 3147
          reusing_block = false;
        }
3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166
        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key this block and the the iter is not
          // the end of the block, i.e. cannot be in the following blocks
          // either. In this case, the seek_key cannot be found, so we break
          // from the top level for-loop.
3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            Cleanable dummy;
            Cleanable* value_pinner = nullptr;

            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }
            if (biter.IsValuePinned()) {
              if (reusing_block) {
                Cache* block_cache = rep_->table_options.block_cache.get();
                assert(biter.cache_handle() != nullptr);
                block_cache->Ref(biter.cache_handle());
                dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
                                      biter.cache_handle());
                value_pinner = &dummy;
              } else {
                value_pinner = &biter;
              }
3189
            }
3190

3191 3192 3193 3194 3195 3196 3197
            if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
                                        value_pinner)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
3198
          }
3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218
          s = biter.status();
        }
        // Write the block cache access.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237
        }
        if (done) {
          // Avoid the extra Next which is expensive in two-level indexes
          break;
        }
      }
      if (matched && filter != nullptr && !filter->IsBlockBased()) {
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                  rep_->level);
      }
      if (s.ok()) {
        s = iiter->status();
      }
      *(miter->s) = s;
    }
  }
}

3238 3239 3240
Status BlockBasedTable::Prefetch(const Slice* const begin,
                                 const Slice* const end) {
  auto& comparator = rep_->internal_comparator;
M
Maysam Yabandeh 已提交
3241
  auto user_comparator = comparator.user_comparator();
3242 3243 3244 3245
  // pre-condition
  if (begin && end && comparator.Compare(*begin, *end) > 0) {
    return Status::InvalidArgument(*begin, *end);
  }
3246
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
M
Maysam Yabandeh 已提交
3247
  IndexBlockIter iiter_on_stack;
3248 3249 3250
  auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                                &iiter_on_stack, /*get_context=*/nullptr,
                                &lookup_context);
3251
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
M
Maysam Yabandeh 已提交
3252
  if (iiter != &iiter_on_stack) {
3253 3254
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
M
Maysam Yabandeh 已提交
3255
  }
3256

M
Maysam Yabandeh 已提交
3257
  if (!iiter->status().ok()) {
3258
    // error opening index iterator
M
Maysam Yabandeh 已提交
3259
    return iiter->status();
3260 3261 3262 3263 3264
  }

  // indicates if we are on the last page that need to be pre-fetched
  bool prefetching_boundary_page = false;

M
Maysam Yabandeh 已提交
3265 3266
  for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
       iiter->Next()) {
3267
    BlockHandle block_handle = iiter->value();
3268 3269
    const bool is_user_key = rep_->table_properties &&
                             rep_->table_properties->index_key_is_user_key > 0;
M
Maysam Yabandeh 已提交
3270 3271 3272 3273
    if (end &&
        ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
         (is_user_key &&
          user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
3274 3275 3276 3277 3278 3279 3280 3281 3282 3283
      if (prefetching_boundary_page) {
        break;
      }

      // The index entry represents the last key in the data block.
      // We should load this page into memory as well, but no more
      prefetching_boundary_page = true;
    }

    // Load the block specified by the block_handle into the block cache
M
Maysam Yabandeh 已提交
3284
    DataBlockIter biter;
3285 3286 3287 3288 3289 3290

    NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, &lookup_context, Status(),
        /*prefetch_buffer=*/nullptr);
3291 3292 3293 3294 3295 3296 3297 3298 3299 3300

    if (!biter.status().ok()) {
      // there was an unexpected error while pre-fetching
      return biter.status();
    }
  }

  return Status::OK();
}

3301
Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
A
Aaron G 已提交
3302 3303 3304 3305
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
3306
  s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter);
A
Aaron G 已提交
3307
  if (s.ok()) {
3308
    s = VerifyChecksumInMetaBlocks(meta_iter.get());
A
Aaron G 已提交
3309 3310 3311 3312 3313 3314 3315
    if (!s.ok()) {
      return s;
    }
  } else {
    return s;
  }
  // Check Data blocks
M
Maysam Yabandeh 已提交
3316
  IndexBlockIter iiter_on_stack;
3317
  BlockCacheLookupContext context{caller};
3318 3319
  InternalIteratorBase<BlockHandle>* iiter = NewIndexIterator(
      ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
3320
      /*get_context=*/nullptr, &context);
3321
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
A
Aaron G 已提交
3322
  if (iiter != &iiter_on_stack) {
3323 3324
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
A
Aaron G 已提交
3325 3326 3327 3328 3329 3330 3331 3332 3333
  }
  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }
  s = VerifyChecksumInBlocks(iiter);
  return s;
}

3334 3335
Status BlockBasedTable::VerifyChecksumInBlocks(
    InternalIteratorBase<BlockHandle>* index_iter) {
A
Aaron G 已提交
3336 3337 3338 3339 3340 3341
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
3342 3343
    BlockHandle handle = index_iter->value();
    BlockContents contents;
3344 3345 3346
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
3347
        false /* decompress */, false /*maybe_compressed*/, BlockType::kData,
3348
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
3349 3350 3351 3352 3353 3354 3355 3356
    s = block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388
// Maps a metaindex entry name to the BlockType of the meta block it refers
// to. Unknown names yield BlockType::kInvalid (and assert in debug builds).
BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
    const Slice& meta_block_name) {
  // All three filter flavors share a recognizable name prefix.
  const bool is_filter_block =
      meta_block_name.starts_with(kFilterBlockPrefix) ||
      meta_block_name.starts_with(kFullFilterBlockPrefix) ||
      meta_block_name.starts_with(kPartitionedFilterBlockPrefix);
  if (is_filter_block) {
    return BlockType::kFilter;
  }

  // The remaining meta blocks are identified by exact name.
  if (meta_block_name == kPropertiesBlock) {
    return BlockType::kProperties;
  } else if (meta_block_name == kCompressionDictBlock) {
    return BlockType::kCompressionDictionary;
  } else if (meta_block_name == kRangeDelBlock) {
    return BlockType::kRangeDeletion;
  } else if (meta_block_name == kHashIndexPrefixesBlock) {
    return BlockType::kHashIndexPrefixes;
  } else if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
    return BlockType::kHashIndexMetadata;
  }

  // Unrecognized meta block name.
  assert(false);
  return BlockType::kInvalid;
}

3389
Status BlockBasedTable::VerifyChecksumInMetaBlocks(
3390 3391 3392 3393
    InternalIteratorBase<Slice>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
A
Aaron G 已提交
3394 3395 3396
    if (!s.ok()) {
      break;
    }
3397 3398 3399
    BlockHandle handle;
    Slice input = index_iter->value();
    s = handle.DecodeFrom(&input);
A
Aaron G 已提交
3400
    BlockContents contents;
3401
    const Slice meta_block_name = index_iter->key();
3402 3403 3404 3405
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
3406
        GetBlockTypeForMetaBlockByName(meta_block_name),
3407
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
S
Siying Dong 已提交
3408
    s = block_fetcher.ReadBlockContents();
3409
    if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
3410
      TableProperties* table_properties;
3411
      s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
3412 3413 3414 3415
                                           index_iter->value(),
                                           &table_properties);
      delete table_properties;
    }
A
Aaron G 已提交
3416 3417 3418 3419 3420 3421 3422
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

3423 3424 3425 3426 3427 3428 3429 3430 3431
bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
  assert(rep_ != nullptr);

  Cache* const cache = rep_->table_options.block_cache.get();
  if (cache == nullptr) {
    return false;
  }

  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3432 3433 3434
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);
3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445

  Cache::Handle* const cache_handle = cache->Lookup(cache_key);
  if (cache_handle == nullptr) {
    return false;
  }

  cache->Release(cache_handle);

  return true;
}

S
Siying Dong 已提交
3446 3447
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
3448 3449 3450
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
      /*get_context=*/nullptr, /*lookup_contex=*/nullptr));
I
Igor Canadi 已提交
3451 3452 3453
  iiter->Seek(key);
  assert(iiter->Valid());

3454
  return TEST_BlockInCache(iiter->value());
3455
}
S
Sanjay Ghemawat 已提交
3456

3457
BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
3458 3459
  // Some old version of block-based tables don't have index type present in
  // table properties. If that's the case we can safely use the kBinarySearch.
3460 3461
  BlockBasedTableOptions::IndexType index_type_on_file =
      BlockBasedTableOptions::kBinarySearch;
3462 3463 3464 3465 3466 3467
  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
3468 3469
      // update index_type with the true type
      rep_->index_type = index_type_on_file;
3470 3471
    }
  }
3472 3473 3474 3475 3476 3477 3478 3479 3480 3481
  return index_type_on_file;
}

// Creates the IndexReader matching rep_->index_type (partitioned, binary
// search, or hash). For hash indexes, the metaindex iterator is needed; it
// is either the preloaded one passed in or read from the file here, and any
// failure to read it degrades gracefully to a binary-search index.
//
// REQUIRES: The following fields of rep_ should have already been populated:
//  1. file
//  2. index_handle,
//  3. options
//  4. internal_comparator
//  5. index_type
Status BlockBasedTable::CreateIndexReader(
    FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
    bool pin, IndexReader** index_reader,
    BlockCacheLookupContext* lookup_context) {
  auto index_type_on_file = rep_->index_type;

  // kHashSearch requires non-empty prefix_extractor but bypass checking
  // prefix_extractor here since we have no access to MutableCFOptions.
  // Add need_upper_bound_check flag in  BlockBasedTable::NewIndexIterator.
  // If prefix_extractor does not match prefix_extractor_name from table
  // properties, turn off Hash Index by setting total_order_seek to true

  switch (index_type_on_file) {
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
                                          prefetch, pin, index_reader,
                                          lookup_context);
    }
    case BlockBasedTableOptions::kBinarySearch: {
      return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                             prefetch, pin, index_reader,
                                             lookup_context);
    }
    case BlockBasedTableOptions::kHashSearch: {
      std::unique_ptr<Block> meta_guard;
      std::unique_ptr<InternalIterator> meta_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      if (meta_index_iter == nullptr) {
        auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard);
        if (!s.ok()) {
          // we simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          ROCKS_LOG_WARN(rep_->ioptions.info_log,
                         "Unable to read the metaindex block."
                         " Fall back to binary search index.");
          return BinarySearchIndexReader::Create(this, prefetch_buffer,
                                                 use_cache, prefetch, pin,
                                                 index_reader, lookup_context);
        }
        meta_index_iter = meta_iter_guard.get();
      }

      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
                                     use_cache, prefetch, pin, index_reader,
                                     lookup_context);
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(index_type_on_file);
      return Status::InvalidArgument(error_message.c_str());
    }
  }
}

3536
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
3537 3538
                                              TableReaderCaller caller) {
  BlockCacheLookupContext context(caller);
3539
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
3540 3541 3542
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_contex=*/&context));
K
Kai Liu 已提交
3543

J
jorlow@chromium.org 已提交
3544 3545 3546
  index_iter->Seek(key);
  uint64_t result;
  if (index_iter->Valid()) {
3547 3548
    BlockHandle handle = index_iter->value();
    result = handle.offset();
J
jorlow@chromium.org 已提交
3549
  } else {
K
Kai Liu 已提交
3550 3551 3552
    // key is past the last key in the file. If table_properties is not
    // available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
3553 3554 3555 3556
    result = 0;
    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
    }
K
Kai Liu 已提交
3557 3558
    // table_properties is not present in the table.
    if (result == 0) {
I
xxHash  
Igor Canadi 已提交
3559
      result = rep_->footer.metaindex_handle().offset();
K
Kai Liu 已提交
3560
    }
J
jorlow@chromium.org 已提交
3561 3562 3563 3564
  }
  return result;
}

3565 3566 3567 3568
bool BlockBasedTable::TEST_filter_block_preloaded() const {
  return rep_->filter != nullptr;
}

3569 3570 3571 3572
// Test-only hook: reports whether this table's index block currently
// resides in the block cache.
bool BlockBasedTable::TEST_IndexBlockInCache() const {
  assert(rep_ != nullptr);
  return TEST_BlockInCache(rep_->footer.index_handle());
}

O
omegaga 已提交
3575 3576
// Collects every key/value pair from every data block into
// `kv_pair_blocks`, producing one KVPairBlock per data block. Blocks that
// fail to open are skipped; a per-entry read error aborts only that block.
// NOTE(review): read errors hit after the index iterator is created are
// swallowed and Status::OK() is still returned — presumably a deliberate
// best-effort contract; confirm against callers.
Status BlockBasedTable::GetKVPairsFromDataBlocks(
    std::vector<KVPairBlock>* kv_pair_blocks) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_contex=*/nullptr));

  Status s = index_iter->status();
  if (!s.ok()) {
    // Cannot read Index Block
    return s;
  }

  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }

    std::unique_ptr<InternalIterator> data_iter;
    data_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), index_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = data_iter->status();
    if (!s.ok()) {
      // Error reading the block - Skipped
      continue;
    }

    KVPairBlock kv_pair_block;
    for (data_iter->SeekToFirst(); data_iter->Valid(); data_iter->Next()) {
      s = data_iter->status();
      if (!s.ok()) {
        // Error reading the block - Skipped
        break;
      }
      // Deep-copy key and value: the slices are only valid while the block
      // iterator is alive.
      kv_pair_block.emplace_back(data_iter->key().ToString(),
                                 data_iter->value().ToString());
    }
    kv_pair_blocks->push_back(std::move(kv_pair_block));
  }
  return Status::OK();
}

3631 3632
// Dumps a human-readable description of the whole table — footer, metaindex,
// table properties, filter, index, compression dictionary, range deletions,
// and every data block — to `out_file`.
//
// `prefix_extractor` is only used when reconstructing a block-based (Bloom)
// filter reader for a table whose filter was not loaded at open time.
// Returns the first unrecoverable read error, or OK.
Status BlockBasedTable::DumpTable(WritableFile* out_file,
                                  const SliceTransform* prefix_extractor) {
  // Output Footer
  out_file->Append(
      "Footer Details:\n"
      "--------------------------------------\n"
      "  ");
  out_file->Append(rep_->footer.ToString().c_str());
  out_file->Append("\n");

  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter);
  if (s.ok()) {
    for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
      s = meta_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (meta_iter->key() == rocksdb::kPropertiesBlock) {
        out_file->Append("  Properties block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
        out_file->Append("  Compression dictionary block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (strstr(meta_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append("  Filter block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
        out_file->Append("  Range deletion block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const rocksdb::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
    out_file->Append("\n");

    // Output Filter blocks
    if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
      // Support only BloomFilter as off now
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
      if (table_properties->filter_policy_name.compare(
              table_options.filter_policy->Name()) == 0) {
        std::string filter_block_key = kFilterBlockPrefix;
        filter_block_key.append(table_properties->filter_policy_name);
        BlockHandle handle;
        if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
          BlockContents block;
          BlockFetcher block_fetcher(
              rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer,
              ReadOptions(), handle, &block, rep_->ioptions,
              false /*decompress*/, false /*maybe_compressed*/,
              BlockType::kFilter, UncompressionDict::GetEmptyDict(),
              rep_->persistent_cache_options);
          s = block_fetcher.ReadBlockContents();
          // BUG FIX: construct the filter reader only when the filter block
          // was read successfully. The previous inverted check (`!s.ok()`)
          // built the reader from unread/invalid block contents exactly when
          // the read FAILED, and never built it on success.
          if (s.ok()) {
            rep_->filter.reset(new BlockBasedFilterBlockReader(
                prefix_extractor, table_options,
                table_options.whole_key_filtering, std::move(block),
                rep_->ioptions.statistics));
          }
        }
      }
    }
  }
  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<const BlockContents> compression_dict_block;
    s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */,
                                 &compression_dict_block);
    if (!s.ok()) {
      return s;
    }
    assert(compression_dict_block != nullptr);
    auto compression_dict = compression_dict_block->data;
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append("  size (bytes): ");
    out_file->Append(rocksdb::ToString(compression_dict.size()));
    out_file->Append("\n\n");
    out_file->Append("  HEX    ");
    out_file->Append(compression_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          "  ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
    }
    delete range_del_iter;
  }
  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

3777
void BlockBasedTable::Close() {
3778 3779 3780
  if (rep_->closed) {
    return;
  }
3781 3782 3783 3784 3785

  Cache* const cache = rep_->table_options.block_cache.get();

  // cleanup index, filter, and compression dictionary blocks
  // to avoid accessing dangling pointers
3786 3787
  if (!rep_->table_options.no_block_cache) {
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
3788

3789 3790
    // Get the filter block key
    auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
M
Maysam Yabandeh 已提交
3791
                           rep_->filter_handle, cache_key);
3792 3793 3794 3795 3796 3797 3798 3799
    cache->Erase(key);

    if (!rep_->compression_dict_handle.IsNull()) {
      // Get the compression dictionary block key
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        rep_->compression_dict_handle, cache_key);
      cache->Erase(key);
    }
3800
  }
3801

3802
  rep_->closed = true;
3803 3804
}

3805 3806 3807 3808
// Writes the index block to `out_file`: each index key in hex and ASCII,
// followed by the data block handle it points at.
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_contex=*/nullptr));
  Status s = index_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = index_iter->key();
    Slice user_key;
    InternalKey ikey;
    // The index may store bare user keys (index_key_is_user_key) or full
    // internal keys that must be decoded first.
    if (rep_->table_properties &&
        rep_->table_properties->index_key_is_user_key != 0) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append("  HEX    ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(index_iter->value().ToString(true).c_str());
    out_file->Append("\n");

    // ASCII rendering: one space after every key byte.
    std::string ascii_key;
    for (char c : user_key.ToString()) {
      ascii_key.push_back(c);
      ascii_key.push_back(' ');
    }
    out_file->Append("  ASCII  ");
    out_file->Append(ascii_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

// Writes every data block (all keys and values via DumpKeyValue) to
// `out_file`, followed by summary statistics over the raw block sizes.
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_contex=*/nullptr));
  Status s = index_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  // Running statistics for the trailing summary.
  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (index_iter->SeekToFirst(); index_iter->Valid();
       block_id++, index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }

    const BlockHandle bh = index_iter->value();
    const uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(rocksdb::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(index_iter->value().ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), index_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped \n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped \n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

  const uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    const double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(rocksdb::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956
// Writes one key/value pair to `out_file` in two renderings: hex, then
// ASCII with a space after every byte (NUL bytes escaped as "\0" so they
// remain visible).
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  // Shared spaced-ASCII renderer for both the key and the value.
  auto to_spaced_ascii = [](const std::string& in) {
    std::string out;
    for (char c : in) {
      if (c == '\0') {
        out.append("\\0", 2);
      } else {
        out.push_back(c);
      }
      out.push_back(' ');
    }
    return out;
  };

  const std::string res_key = to_spaced_ascii(ikey.user_key().ToString());
  const std::string res_value = to_spaced_ascii(value.ToString());

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

3980 3981
namespace {

A
Andrew Kryczka 已提交
3982
void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
3983 3984 3985
  FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
  if (filter->statistics() != nullptr) {
    RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
3986
               filter->ApproximateMemoryUsage());
3987 3988 3989 3990
  }
  delete filter;
}

3991 3992 3993 3994 3995 3996 3997
// Block-cache deleter for UncompressionDict entries: records the evicted
// bytes and frees the dictionary.
void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) {
  auto* dict = static_cast<UncompressionDict*>(value);
  RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
             dict->ApproximateMemoryUsage());
  delete dict;
}

3998 3999
}  // anonymous namespace

4000
}  // namespace rocksdb