//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/xxhash.h"

namespace rocksdb {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
  BlockContents contents;
  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
                             &contents, ioptions, do_uncompress,
                             maybe_compressed, uncompression_dict,
                             cache_options, memory_allocator);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}

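// Return the memory allocator attached to the (uncompressed) block cache,
// or nullptr if no block cache is configured.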
inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
             ? table_options.block_cache->memory_allocator()
             : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
             ? table_options.block_cache_compressed->memory_allocator()
             : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

void DeleteCachedFilterEntry(const Slice& key, void* value);
void DeleteCachedUncompressionDictEntry(const Slice& key, void* value);

// Release the cached entry and decrement its ref count.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}

}  // namespace

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_key_is_user_key;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_value_is_delta_encoded;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        get_context, lookup_context, index_block);
}

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance of
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new PartitionIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  // return a two-level iterator: first level is on the partition index
  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    InternalIteratorBase<BlockHandle>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(
              table(), &partition_map_, index_key_includes_seq(),
              index_value_is_full()),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          index_key_includes_seq(), index_value_is_full());
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

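  // Prefetch all index partitions with a single contiguous read, then load
  // them into the block cache one by one. If `pin` is true, a handle to each
  // cached partition is kept in partition_map_ so that it stays resident for
  // the lifetime of this index reader.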
  void CacheDependencies(bool pin) override {
    // Before reading partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
    auto rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    auto& file = rep->file;
    prefetch_buffer.reset(new FilePrefetchBuffer());
    s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value();
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
422 423
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance of
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new BinarySearchIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());
    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    auto rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }
      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader = new HashIndexReader(table, std::move(index_block));
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
        cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
        cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_key_includes_seq(),
        index_value_is_full(), false /* block_contents_pinned */,
        prefix_index_.get());
    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

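// Bump the block cache hit statistics for the given block type: the per-level
// perf context counters, plus either the per-lookup GetContext stats (when
// one is supplied) or the global tickers.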
void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

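// Look up a block in the given cache and update the hit/miss metrics for
// `block_type` accordingly. Returns nullptr on a cache miss.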
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to set up the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

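// Compute a cache key prefix for the file: prefer the unique id provided by
// the underlying file (if any, and if it fits); otherwise fall back to an id
// newly allocated from the cache.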
void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (*size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {
// Return true if table_properties has `user_prop_name` with a `true` value
// or doesn't contain this property (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older version doesn't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has global seqno property "
          "with value %s, while largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
}  // namespace

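// Build the cache key for a block: the file's cache key prefix followed by
// the varint-encoded offset of the block within the file.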
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

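// Open the table file: prefetch the tail, read the footer and the meta blocks
// (metaindex, properties, range tombstones), then optionally pre-read the
// index and filter blocks. On success, *table_reader owns the new reader.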
Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;
  s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                   preload_all, &prefetch_buffer);

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with a newer "
        "version of RocksDB?");
  }
  // We've successfully read the footer. We are ready to serve requests.
  // Better not to mutate rep_ after creation; e.g., the
  // internal_prefix_transform raw pointer will be used to create
  // HashIndexReader, whose reset may access a dangling pointer.
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->index_type = table_options.index_type;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(prefix_extractor));
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));
  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(),
                                     largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all,
      table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

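// Prefetch the tail of the file (footer plus meta blocks). The prefetch size
// is taken from historical tail sizes when available; otherwise a fixed
// heuristic based on the pinning/preload options is used.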
Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before read footer, readahead backwards to prefetch data. Do more
    // readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  Status s;
  // TODO should not have this special logic in the future.
  if (!file->use_direct_io()) {
    prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true));
    s = file->Prefetch(prefetch_off, prefetch_len);
  } else {
    prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true));
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return s;
}

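// Recompute the checksum of `buf` with the given checksum type and compare it
// against the expected value read from the block trailer.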
Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  Status s;
  uint32_t actual = 0;
  switch (type) {
    case kNoChecksum:
      break;
    case kCRC32c:
      expected = crc32c::Unmask(expected);
      actual = crc32c::Value(buf, len);
      break;
    case kxxHash:
      actual = XXH32(buf, static_cast<int>(len), 0);
      break;
    case kxxHash64:
      actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                     uint64_t{0xffffffff});
      break;
    default:
      s = Status::Corruption("unknown checksum type");
  }
  if (s.ok() && actual != expected) {
    s = Status::Corruption("properties block checksum mismatched");
  }
  return s;
}

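// Re-read the properties block with checksum verification deferred, restore
// the global seqno field to its original value of zero in a private copy,
// and verify the checksum against that copy (see the comment below).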
Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
                                block_size + 1, value);
  }
  return s;
}

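// Locate and read the properties meta block. If the checksum does not match
// (e.g. for externally ingested files whose global seqno was rewritten in
// place), fall back to TryReadPropertiesWithGlobalSeqno. On success this
// populates rep_->table_properties and the flags derived from it.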
Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
  bool found_properties_block = true;
  Status s;
  s = SeekToPropertiesBlock(meta_iter, &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(
          meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, true /* verify_checksum */,
          nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
          false /* compression_type_missing */, nullptr /* memory_allocator */);
    }

    if (s.IsCorruption()) {
      s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
                                           &table_properties);
    }
    std::unique_ptr<TableProperties> props_guard;
    if (table_properties != nullptr) {
      props_guard.reset(table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties.reset(props_guard.release());
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }
#ifndef ROCKSDB_LITE
  if (rep_->table_properties) {
    ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
                        &(rep_->table_prefix_extractor));
  }
#endif  // ROCKSDB_LITE

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.info_log);
    rep_->prefix_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kPrefixFiltering,
                           rep_->ioptions.info_log);

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
    }
  }
  return s;
}

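// Read the range deletion meta block, if present, and build the fragmented
// range tombstone list shared by all iterators over this table.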
Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
  Status s;
  bool found_range_del_block;
  BlockHandle range_del_handle;
  s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (found_range_del_block && !range_del_handle.IsNull()) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle,
        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}

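// Reads the compression dictionary meta-block into an owned BlockContents.
// The block is fetched with checksum verification but without attempting
// decompression (note the false decompress/maybe_compressed flags below).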
Status BlockBasedTable::ReadCompressionDictBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<const BlockContents>* compression_dict_block) const {
  assert(compression_dict_block != nullptr);
  Status s;
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()};
    PersistentCacheOptions cache_options;
    ReadOptions read_options;
    read_options.verify_checksums = true;
    BlockFetcher compression_block_fetcher(
        rep_->file.get(), prefetch_buffer, rep_->footer, read_options,
        rep_->compression_dict_handle, compression_dict_cont.get(),
        rep_->ioptions, false /* decompress */, false /*maybe_compressed*/,
        UncompressionDict::GetEmptyDict(), cache_options);
    s = compression_block_fetcher.ReadBlockContents();

    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from compression dictionary "
          "block %s",
          s.ToString().c_str());
    } else {
      *compression_dict_block = std::move(compression_dict_cont);
    }
  }
  return s;
}

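// Loads the index and filter blocks eagerly according to the table options:
// depending on cache_index_and_filter_blocks, the pin/prefetch settings and
// the file's level, the top-level index and filter (and, for the partitioned
// variants, their partitions) are either read into rep_ directly or warmed
// into the block cache and optionally pinned for this reader's lifetime.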
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    BlockCacheLookupContext* lookup_context) {
  Status s;

  // Find filter handle and filter type
  if (rep_->filter_policy) {
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep_->filter_policy->Name());
      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
              .ok()) {
        rep_->filter_type = filter_type;
        break;
      }
    }
  }

  {
    // Find compression dictionary handle
    bool found_compression_dict;
    s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
                                   &rep_->compression_dict_handle);
  }

  BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  // prefetch the first level of index
  const bool prefetch_index =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // prefetch the first level of filter
  const bool prefetch_filter =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partition filters cannot be enabled without partitioned indexes
  assert(!prefetch_filter || prefetch_index);
  // pin both index and filters, down to all partitions
  const bool pin_all =
      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // pin the first level of index
  const bool pin_index =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // pin the first level of filter
  const bool pin_filter =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);

  IndexReader* index_reader = nullptr;
  if (s.ok()) {
    s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
                                     prefetch_index, pin_index, &index_reader,
                                     lookup_context);
    if (s.ok()) {
      assert(index_reader != nullptr);
      rep_->index_reader.reset(index_reader);
      // The partitions of a partitioned index are always stored in the block
      // cache. They hence follow the pin and prefetch configuration regardless
      // of the value of cache_index_and_filter_blocks.
      if (prefetch_all) {
        rep_->index_reader->CacheDependencies(pin_all);
      }
    } else {
      delete index_reader;
      index_reader = nullptr;
    }
  }

  // When cache_index_and_filter_blocks is set, meta-blocks are accessed
  // through the block cache, so the filter is warmed into the cache below;
  // otherwise the filter and compression dictionary are read directly into
  // this reader. Index and filter are always prefetched for level 0.
  // TODO(ajkr): also prefetch compression dictionary block.
  // TODO(ajkr): also pin compression dictionary block when
  // `pin_l0_filter_and_index_blocks_in_cache == true`.
  if (table_options.cache_index_and_filter_blocks) {
    assert(table_options.block_cache != nullptr);
    if (s.ok() && prefetch_filter) {
      // Hack: Call GetFilter() to implicitly add filter to the block_cache
      auto filter_entry =
          new_table->GetFilter(rep_->table_prefix_extractor.get(),
                               /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                               /*get_context=*/nullptr, lookup_context);
      if (filter_entry.GetValue() != nullptr && prefetch_all) {
        filter_entry.GetValue()->CacheDependencies(
            pin_all, rep_->table_prefix_extractor.get());
      }
      // if pin_filter is true then save it in rep_->filter_entry; it will be
      // released in the destructor only, hence it will be pinned in the
      // cache while this reader is alive
      if (pin_filter) {
        rep_->filter_entry = std::move(filter_entry);
      }
    }
  } else {
    std::unique_ptr<const BlockContents> compression_dict_block;
    if (s.ok()) {
      // Set filter block
      if (rep_->filter_policy) {
        const bool is_a_filter_partition = true;
        auto filter = new_table->ReadFilter(
            prefetch_buffer, rep_->filter_handle, !is_a_filter_partition,
            rep_->table_prefix_extractor.get());
        rep_->filter.reset(filter);
        // Refer to the comment above about partitioned indexes always being
        // cached
        if (filter && prefetch_all) {
          filter->CacheDependencies(pin_all,
                                    rep_->table_prefix_extractor.get());
        }
      }
      s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    }
    if (s.ok() && !rep_->compression_dict_handle.IsNull()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      rep_->uncompression_dict.reset(new UncompressionDict(
          compression_dict_block->data.ToString(),
          rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics));
    }
  }
  return s;
}

void BlockBasedTable::SetupForCompaction() {
  switch (rep_->ioptions.access_hint_on_compaction_start) {
    case Options::NONE:
      break;
    case Options::NORMAL:
      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
      break;
    case Options::SEQUENTIAL:
      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
      break;
    case Options::WILLNEED:
      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
      break;
    default:
      assert(false);
  }
}

std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}

size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict) {
    usage += rep_->uncompression_dict->ApproximateMemoryUsage();
  }
  return usage;
}

// Load the meta-index block from the file. On success, return the loaded
// meta-index block and an iterator over it.
Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer,
                                      std::unique_ptr<Block>* meta_block,
                                      std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  std::unique_ptr<Block> meta;
  Status s = ReadBlockFromFile(
      rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
      rep_->footer.metaindex_handle(), &meta, rep_->ioptions,
      true /* decompress */, true /*maybe_compressed*/,
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
      kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
      GetMemoryAllocator(rep_->table_options));

  if (!s.ok()) {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Encountered error while reading data from meta-index"
                    " block %s",
                    s.ToString().c_str());
    return s;
  }

  *meta_block = std::move(meta);
  // meta block uses bytewise comparator.
  iter->reset(meta_block->get()->NewIterator<DataBlockIter>(
      BytewiseComparator(), BytewiseComparator()));
  return Status::OK();
}

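// Looks up a data block in the uncompressed block cache first and then, on a
// miss, in the compressed block cache. A compressed-cache hit is uncompressed
// with `uncompression_dict` and, when read_options.fill_cache allows it,
// promoted into the uncompressed cache before being returned.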
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<Block>* block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    GetContext* get_context) const {
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  assert(block);
  assert(block->IsEmpty());

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
                                          block_type, get_context);
    if (cache_handle != nullptr) {
      block->SetCachedValue(
          reinterpret_cast<Block*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search the compressed block cache.
  assert(block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);

  Statistics* statistics = rep_->ioptions.statistics;

  // If we find the block in the compressed cache, uncompress it and insert
  // it into the uncompressed cache.
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type = compressed_block->get_compression_type();
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressBlockContents(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert uncompressed block into block cache
  if (s.ok()) {
    std::unique_ptr<Block> block_holder(
        new Block(std::move(contents), rep_->get_global_seqno(block_type),
                  read_amp_bytes_per_bit, statistics));  // uncompressed block

    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                              &DeleteCachedEntry<Block>, &cache_handle);
#ifndef NDEBUG
      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
      if (s.ok()) {
        assert(cache_handle != nullptr);
        block->SetCachedValue(block_holder.release(), block_cache,
                              cache_handle);

        UpdateCacheInsertionMetrics(block_type, get_context, charge);
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      block->SetOwnedValue(block_holder.release());
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

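// Uncompresses `raw_block_contents` if necessary and inserts the result into
// the uncompressed block cache; the raw compressed bytes may additionally be
// inserted into the compressed block cache. Filter, index and compression
// dictionary blocks are inserted with high priority when
// cache_index_and_filter_blocks_with_high_priority is set.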
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    CachableEntry<Block>* cached_block, BlockContents* raw_block_contents,
    CompressionType raw_block_comp_type,
    const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
    MemoryAllocator* memory_allocator, BlockType block_type,
    GetContext* get_context) const {
  const ImmutableCFOptions& ioptions = rep_->ioptions;
  const uint32_t format_version = rep_->table_options.format_version;
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  const Cache::Priority priority =
      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
              (block_type == BlockType::kFilter ||
               block_type == BlockType::kCompressionDictionary ||
               block_type == BlockType::kIndex)
          ? Cache::Priority::HIGH
          : Cache::Priority::LOW;
  assert(cached_block);
  assert(cached_block->IsEmpty());
  assert(raw_block_comp_type == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  Statistics* statistics = ioptions.statistics;

  std::unique_ptr<Block> block_holder;
  if (raw_block_comp_type != kNoCompression) {
    // Retrieve the uncompressed contents into a new buffer
    BlockContents uncompressed_block_contents;
    UncompressionContext context(raw_block_comp_type);
    UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
    s = UncompressBlockContents(info, raw_block_contents->data.data(),
                                raw_block_contents->data.size(),
                                &uncompressed_block_contents, format_version,
                                ioptions, memory_allocator);
    if (!s.ok()) {
      return s;
    }

    block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  } else {
    block_holder.reset(new Block(std::move(*raw_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr &&
      raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
      raw_block_contents->own_bytes()) {
#ifndef NDEBUG
    assert(raw_block_contents->is_raw_block);
#endif  // NDEBUG

    // We cannot directly put raw_block_contents because this could point to
    // an object on the stack.
    BlockContents* block_cont_for_comp_cache =
        new BlockContents(std::move(*raw_block_contents));
    s = block_cache_compressed->Insert(
        compressed_block_cache_key, block_cont_for_comp_cache,
        block_cont_for_comp_cache->ApproximateMemoryUsage(),
        &DeleteCachedEntry<BlockContents>);
    if (s.ok()) {
      // The cache took ownership of the block; do not delete it below.
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
      delete block_cont_for_comp_cache;
    }
  }

  // insert into uncompressed block cache
  if (block_cache != nullptr && block_holder->own_bytes()) {
    size_t charge = block_holder->ApproximateMemoryUsage();
    Cache::Handle* cache_handle = nullptr;
    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                            &DeleteCachedEntry<Block>, &cache_handle, priority);
#ifndef NDEBUG
    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
    if (s.ok()) {
      assert(cache_handle != nullptr);
      cached_block->SetCachedValue(block_holder.release(), block_cache,
                                   cache_handle);

      UpdateCacheInsertionMetrics(block_type, get_context, charge);
    } else {
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
    }
  } else {
    cached_block->SetOwnedValue(block_holder.release());
  }

  return s;
}

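// Reads a filter block from the file and wraps it in the FilterBlockReader
// implementation matching rep_->filter_type (a filter partition is read as a
// full filter). Returns nullptr when the table has no filter or the read
// fails; the caller owns the returned reader.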
FilterBlockReader* BlockBasedTable::ReadFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
    const bool is_a_filter_partition,
    const SliceTransform* prefix_extractor) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;

  BlockFetcher block_fetcher(
      rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
      filter_handle, &block, rep->ioptions, false /* decompress */,
      false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
      rep->persistent_cache_options, GetMemoryAllocator(rep->table_options));
  Status s = block_fetcher.ReadBlockContents();

  if (!s.ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_key_is_user_key == 0,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_value_is_delta_encoded == 0);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}

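// Returns the filter either directly from rep_ (when filters are not cached
// in the block cache) or via a block cache lookup, reading and inserting it
// on a miss unless no_io is set. The returned entry holds a cache handle only
// when the filter actually came from the block cache.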
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer,
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  const BlockHandle& filter_blk_handle = rep_->filter_handle;
  const bool is_a_filter_partition = true;
  return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition,
                   no_io, get_context, lookup_context, prefix_extractor);
}

CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
    const bool is_a_filter_partition, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    const SliceTransform* prefix_extractor) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr,
            /*own_value=*/false};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return CachableEntry<FilterBlockReader>();
  }

  if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
    return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
            /*cache_handle=*/nullptr, /*own_value=*/false};
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Cache::Handle* cache_handle =
      GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context);

  FilterBlockReader* filter = nullptr;
  size_t usage = 0;
  bool is_cache_hit = false;
  bool return_empty_reader = false;
  if (cache_handle != nullptr) {
    filter =
        reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
    usage = filter->ApproximateMemoryUsage();
    is_cache_hit = true;
  } else if (no_io) {
    // Do not invoke any io.
    return_empty_reader = true;
  } else {
    filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                        is_a_filter_partition, prefix_extractor);
    if (filter != nullptr) {
      usage = filter->ApproximateMemoryUsage();
      Status s = block_cache->Insert(
          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        PERF_COUNTER_ADD(filter_block_read_count, 1);
        UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage);
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        delete filter;
        return_empty_reader = true;
      }
    }
  }

  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceFilterBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }

  if (return_empty_reader) {
    return CachableEntry<FilterBlockReader>();
  }
  return {filter, cache_handle ? block_cache : nullptr, cache_handle,
          /*own_value=*/false};
}

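// Analogous to GetFilter above: returns the uncompression dictionary from
// rep_ when meta-blocks are not cached; otherwise looks it up in the block
// cache and, on a miss, reads the dictionary block and inserts the
// constructed UncompressionDict, unless no_io is set.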
CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
    FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  if (!rep_->table_options.cache_index_and_filter_blocks) {
    // block cache is either disabled or not used for meta-blocks. In either
    // case, BlockBasedTableReader is the owner of the uncompression dictionary.
    return {rep_->uncompression_dict.get(), nullptr /* cache */,
            nullptr /* cache_handle */, false /* own_value */};
  }
  if (rep_->compression_dict_handle.IsNull()) {
    return CachableEntry<UncompressionDict>();
  }
  char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                  rep_->compression_dict_handle, cache_key_buf);
  auto cache_handle =
      GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key,
                        BlockType::kCompressionDictionary, get_context);
  UncompressionDict* dict = nullptr;
  bool is_cache_hit = false;
  size_t usage = 0;
  if (cache_handle != nullptr) {
    dict = reinterpret_cast<UncompressionDict*>(
        rep_->table_options.block_cache->Value(cache_handle));
    is_cache_hit = true;
    usage = dict->ApproximateMemoryUsage();
  } else if (no_io) {
    // Do not invoke any io.
  } else {
    std::unique_ptr<const BlockContents> compression_dict_block;
    Status s =
        ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    if (s.ok()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      std::unique_ptr<UncompressionDict> uncompression_dict(
          new UncompressionDict(compression_dict_block->data.ToString(),
                                rep_->blocks_definitely_zstd_compressed,
                                rep_->ioptions.statistics));
      usage = uncompression_dict->ApproximateMemoryUsage();
      s = rep_->table_options.block_cache->Insert(
          cache_key, uncompression_dict.get(), usage,
          &DeleteCachedUncompressionDictEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);

      if (s.ok()) {
        PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
        UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary,
                                    get_context, usage);
        dict = uncompression_dict.release();
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        assert(dict == nullptr);
        assert(cache_handle == nullptr);
      }
    }
  }
  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, cache_key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }
  return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
          cache_handle, false /* own_value */};
}

// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, bool disable_prefix_seek,
    IndexBlockIter* input_iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  assert(rep_ != nullptr);
  assert(rep_->index_reader != nullptr);

  // We don't return pinned data from index blocks, so no need
  // to set `block_contents_pinned`.
  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
                                         input_iter, get_context,
                                         lookup_context);
}

// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, a new iterator is allocated; otherwise input_iter
// is updated in place and returned.
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
    BlockType block_type, bool key_includes_seq, bool index_key_is_full,
    GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
    FilePrefetchBuffer* prefetch_buffer) const {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  if (!s.ok()) {
    iter->Invalidate(s);
    return iter;
  }

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  auto uncompression_dict_storage =
      GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context);
  const UncompressionDict& uncompression_dict =
      uncompression_dict_storage.GetValue() == nullptr
          ? UncompressionDict::GetEmptyDict()
          : *uncompression_dict_storage.GetValue();

  CachableEntry<Block> block;
  s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block,
                    block_type, get_context, lookup_context);

  if (!s.ok()) {
    assert(block.IsEmpty());
    iter->Invalidate(s);
    return iter;
  }

  assert(block.GetValue() != nullptr);
  constexpr bool kTotalOrderSeek = true;
  // Block contents are pinned and it is still pinned after the iterator
  // is destroyed as long as cleanup functions are moved to another object,
  // when:
  // 1. block cache handle is set to be released in cleanup function, or
  // 2. it's pointing to immortal source. If own_bytes is true then we are
  //    not reading data from the original source, whether immortal or not.
  //    Otherwise, the block is pinned iff the source is immortal.
  const bool block_contents_pinned =
      block.IsCached() ||
      (!block.GetValue()->own_bytes() && rep_->immortal_table);
  iter = block.GetValue()->NewIterator<TBlockIter>(
      &rep_->internal_comparator, rep_->internal_comparator.user_comparator(),
      iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
      index_key_is_full, block_contents_pinned);

  if (!block.IsCached()) {
    if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
      // insert a dummy record to block cache to track the memory usage
      Cache* const block_cache = rep_->table_options.block_cache.get();
      Cache::Handle* cache_handle = nullptr;
      // There are two other types of cache keys: 1) SST cache key added in
      // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
      // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
      // from SST cache key (31 bytes), and use non-zero prefix to
      // differentiate from `write_buffer_manager`
      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
      // Prefix: use rep_->cache_key_prefix padded by 0s
      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
      assert(rep_->cache_key_prefix_size != 0);
      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                 next_cache_key_id_++);
      assert(end - cache_key <=
             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
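      // A nullptr value with a real charge reserves capacity in the cache and
      // lets the cleanup registered below release it; the uncached block's
      // contents are never retrievable through this entry.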
      s = block_cache->Insert(unique_key, nullptr,
                              block.GetValue()->ApproximateMemoryUsage(),
                              nullptr, &cache_handle);
      if (s.ok()) {
        assert(cache_handle != nullptr);
        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
                              cache_handle);
      }
    }
  } else {
    iter->SetCacheHandle(block.GetCacheHandle());
  }

  block.TransferTo(iter);
  return iter;
}

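// Looks the block up in the configured block cache(s) and, when I/O is
// permitted and ro.fill_cache is set, reads it from the file and inserts it
// on a miss, populating *block_entry on success. The access is also recorded
// in the block cache trace when tracing is enabled.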
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
  // No point in using the compressed block cache if the table is immortal:
  // its blocks never go away.
  Cache* block_cache_compressed =
      rep_->immortal_table ? nullptr
                           : rep_->table_options.block_cache_compressed.get();

  // First, try to get the block from the cache
  //
  // If either block cache is enabled, we'll try to read from it.
  Status s;
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice key /* key to the block cache */;
  Slice ckey /* key to the compressed block cache */;
  bool is_cache_hit = false;
  bool no_insert = true;
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
                         rep_->compressed_cache_key_prefix_size, handle,
                         compressed_cache_key);
    }

    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
                              ro, block_entry, uncompression_dict, block_type,
                              get_context);
    if (block_entry->GetValue()) {
      // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
      // compressed block cache.
      is_cache_hit = true;
    }
    // Couldn't find the block in the cache. If I/O is allowed, read it from
    // the file.
    if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
      no_insert = false;
      Statistics* statistics = rep_->ioptions.statistics;
      bool do_decompress =
          block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
      CompressionType raw_block_comp_type;
      BlockContents raw_block_contents;
      {
        StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        BlockFetcher block_fetcher(
            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
            &raw_block_contents, rep_->ioptions,
            do_decompress /* do uncompress */, rep_->blocks_maybe_compressed,
            uncompression_dict, rep_->persistent_cache_options,
            GetMemoryAllocator(rep_->table_options),
            GetMemoryAllocatorForCompressedBlock(rep_->table_options));
        s = block_fetcher.ReadBlockContents();
        raw_block_comp_type = block_fetcher.get_compression_type();
      }

      if (s.ok()) {
        SequenceNumber seq_no = rep_->get_global_seqno(block_type);
        // If filling cache is allowed and a cache is configured, try to put
        // the block in the cache.
        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
                                block_entry, &raw_block_contents,
                                raw_block_comp_type, uncompression_dict, seq_no,
                                GetMemoryAllocator(rep_->table_options),
                                block_type, get_context);
      }
    }
  }

  // Fill lookup_context.
  if (block_cache_tracer_ && lookup_context) {
    size_t usage = 0;
    uint64_t nkeys = 0;
    if (block_entry->GetValue()) {
      // Approximate the number of keys in the block using restarts.
      nkeys = rep_->table_options.block_restart_interval *
              block_entry->GetValue()->NumRestarts();
      usage = block_entry->GetValue()->ApproximateMemoryUsage();
    }
    TraceType trace_block_type = TraceType::kTraceMax;
    switch (block_type) {
      case BlockType::kIndex:
        trace_block_type = TraceType::kBlockTraceIndexBlock;
        break;
      case BlockType::kData:
        trace_block_type = TraceType::kBlockTraceDataBlock;
        break;
      case BlockType::kRangeDeletion:
        trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
        break;
      default:
        // This cannot happen.
        assert(false);
        break;
    }
    if (BlockCacheTraceHelper::ShouldTraceReferencedKey(
            trace_block_type, lookup_context->caller)) {
      // Defer logging the access to Get() and MultiGet() to trace additional
      // information, e.g., the referenced key,
      // referenced_key_exist_in_block.

      // Make a copy of the block key here since it will be logged later.
      lookup_context->FillLookupContext(
          is_cache_hit, no_insert, trace_block_type,
          /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
    } else {
      // Avoid making copy of block_key and cf_name when constructing the access
      // record.
      BlockCacheTraceRecord access_record(
          rep_->ioptions.env->NowMicros(),
          /*block_key=*/"", trace_block_type,
          /*block_size=*/usage, rep_->cf_id_for_tracing(),
          /*cf_name=*/"", rep_->level_for_tracing(),
          rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
          no_insert);
      block_cache_tracer_->WriteBlockAccess(access_record, key,
                                            rep_->cf_name_for_tracing(),
                                            /*referenced_key=*/nullptr);
    }
  }

  assert(s.ok() || block_entry->GetValue() == nullptr);
  return s;
}

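// Retrieves a block through the cache path above when the block type is
// served from the block cache, and otherwise reads it directly from the file
// into an entry owned by the caller. Returns Status::Incomplete if a file
// read would be required but ro.read_tier is kBlockCacheTier.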
Status BlockBasedTable::RetrieveBlock(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
  assert(block_entry);
  assert(block_entry->IsEmpty());

  Status s;
  if (rep_->table_options.cache_index_and_filter_blocks ||
      (block_type != BlockType::kFilter &&
       block_type != BlockType::kCompressionDictionary &&
       block_type != BlockType::kIndex)) {
    s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                     uncompression_dict, block_entry,
                                     block_type, get_context, lookup_context);

    if (!s.ok()) {
      return s;
    }

    if (block_entry->GetValue() != nullptr) {
      assert(s.ok());
      return s;
    }
  }

  assert(block_entry->IsEmpty());

  const bool no_io = ro.read_tier == kBlockCacheTier;
  if (no_io) {
    return Status::Incomplete("no blocking io");
  }

  std::unique_ptr<Block> block;

  {
    StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
                 READ_BLOCK_GET_MICROS);
    s = ReadBlockFromFile(
        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
        rep_->ioptions, rep_->blocks_maybe_compressed,
        rep_->blocks_maybe_compressed, uncompression_dict,
        rep_->persistent_cache_options, rep_->get_global_seqno(block_type),
        block_type == BlockType::kData
            ? rep_->table_options.read_amp_bytes_per_bit
            : 0,
        GetMemoryAllocator(rep_->table_options));
  }

  if (!s.ok()) {
    return s;
  }

  block_entry->SetOwnedValue(block.release());

  assert(s.ok());
  return s;
}

BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
    const BlockBasedTable* table,
    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
    bool index_key_includes_seq, bool index_key_is_full)
    : table_(table),
      block_map_(block_map),
      index_key_includes_seq_(index_key_includes_seq),
      index_key_is_full_(index_key_is_full) {}

InternalIteratorBase<BlockHandle>*
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
    const BlockHandle& handle) {
  // Return a block iterator on the index partition
  auto block = block_map_->find(handle.offset());
  // The partition may be absent from the map, e.g., when the block cache did
  // not have space for it; in that case return an empty iterator below.
  if (block != block_map_->end()) {
    auto rep = table_->get_rep();
    assert(rep);

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    return block->second.GetValue()->NewIterator<IndexBlockIter>(
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
        nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
  }
  // Create an empty iterator
  return new IndexBlockIter();
}

// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(
    const Slice& internal_key, const ReadOptions& read_options,
    const SliceTransform* options_prefix_extractor,
    const bool need_upper_bound_check,
    BlockCacheLookupContext* lookup_context) const {
  if (!rep_->filter_policy) {
    return true;
  }

  const SliceTransform* prefix_extractor;

  if (rep_->table_prefix_extractor == nullptr) {
    if (need_upper_bound_check) {
      return true;
    }
    prefix_extractor = options_prefix_extractor;
  } else {
    prefix_extractor = rep_->table_prefix_extractor.get();
  }
  auto user_key = ExtractUserKey(internal_key);
  if (!prefix_extractor->InDomain(user_key)) {
    return true;
  }

  bool may_match = true;
  Status s;

  // First, try the check with the full filter
  auto filter_entry =
      GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                /*get_context=*/nullptr, lookup_context);
  FilterBlockReader* filter = filter_entry.GetValue();
  bool filter_checked = true;
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
      const Slice* const const_ikey_ptr = &internal_key;
      may_match = filter->RangeMayExist(
          read_options.iterate_upper_bound, user_key, prefix_extractor,
          rep_->internal_comparator.user_comparator(), const_ikey_ptr,
          &filter_checked, need_upper_bound_check, lookup_context);
    } else {
      // if prefix_extractor changed for block based filter, skip filter
      if (need_upper_bound_check) {
        return true;
      }
      auto prefix = prefix_extractor->Transform(user_key);
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

      // Then, try to find it within each block
      // we already know prefix_extractor and prefix_extractor_name must match
      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
      std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
          no_io_read_options,
          /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
          /*get_context=*/nullptr, lookup_context));
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
      } else if ((rep_->table_properties &&
                          rep_->table_properties->index_key_is_user_key
                      ? iiter->key()
                      : ExtractUserKey(iiter->key()))
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block.  However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix.  Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix.  Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key.  Thus, the corresponding data block
        // is the only one that could potentially contain the prefix.
        BlockHandle handle = iiter->value();
        may_match = filter->PrefixMayMatch(
            prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
            /*const_key_ptr=*/nullptr, lookup_context);
      }
    }
  }

  if (filter_checked) {
    Statistics* statistics = rep_->ioptions.statistics;
    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
    if (!may_match) {
      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
    }
  }

  return may_match;
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
  is_out_of_bound_ = false;
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  bool need_seek_index = true;
  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
    // Reseek.
    prev_index_value_ = index_iter_->value();
    // We can avoid an index seek if:
    // 1. The new seek key is larger than the current key
    // 2. The new seek key is within the upper bound of the block
    // Since we don't necessarily know the internal key for either
    // the current key or the upper bound, we check user keys and
    // exclude the equality case. Considering internal keys can
    // improve for the boundary cases, but it would complicate the
    // code.
    if (user_comparator_.Compare(ExtractUserKey(target),
                                 block_iter_.user_key()) > 0 &&
        user_comparator_.Compare(ExtractUserKey(target),
                                 index_iter_->user_key()) < 0) {
      need_seek_index = false;
    }
  }

  if (need_seek_index) {
    index_iter_->Seek(target);
    if (!index_iter_->Valid()) {
      ResetDataIter();
      return;
    }
    InitDataBlock();
  }

  block_iter_.Seek(target);

  FindKeyForward();
  CheckOutOfBound();
  assert(
      !block_iter_.Valid() ||
      (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) ||
      (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target),
                                                      block_iter_.key()) <= 0));
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
  is_out_of_bound_ = false;
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely contain the position for `target`, the
  // same as Seek(), rather than the one before it.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4]  [6, 8] [10, 12]
  //  (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they call Seek(7).
  // The only case where the block is different is when they seek to a position
  // on the boundary. For example, if they SeekForPrev(5), we should go to the
  // first block, rather than the second. However, we don't have the information
  // to distinguish the two unless we read the second block. In this case, we'll
  // end up reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    index_iter_->SeekToLast();
    if (!index_iter_->Valid()) {
      ResetDataIter();
      block_iter_points_to_real_block_ = false;
      return;
    }
  }

  InitDataBlock();

  block_iter_.SeekForPrev(target);

  FindKeyBackward();
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToFirst();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
  block_iter_.SeekToFirst();
  FindKeyForward();
  CheckOutOfBound();
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToLast();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
  block_iter_.SeekToLast();
  FindKeyBackward();
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Next();
  FindKeyForward();
}

template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
    Slice* ret_key) {
  Next();
  bool is_valid = Valid();
  if (is_valid) {
    *ret_key = key();
  }
  return is_valid;
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Prev();
  FindKeyBackward();
}

// Based on experiments, a 256 KB readahead size provides the best performance
// for auto readahead. Experiment data is in PR #3282.
template <class TBlockIter, typename TValue>
const size_t
    BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
        256 * 1024;
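
// Illustrative progression (assumes an initial window of 8 KB for
// kInitAutoReadaheadSize, which is declared in the header): once implicit
// auto readahead kicks in, the buffered-I/O path doubles the window on each
// subsequent read until the cap is reached:
//   8 KB -> 16 KB -> 32 KB -> 64 KB -> 128 KB -> 256 KB (kMaxAutoReadaheadSize)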

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value();
  if (!block_iter_points_to_real_block_ ||
      data_block_handle.offset() != prev_index_value_.offset() ||
      // if the previous attempt to read the block missed the cache, try again
      block_iter_.status().IsIncomplete()) {
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

    // Prefetch additional data for range scans (iterators). Enabled only for
    // user reads.
    // Implicit auto readahead:
    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
    // Explicit user requested readahead:
    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
    if (!for_compaction_) {
      if (read_options_.readahead_size == 0) {
        // Implicit auto readahead
        num_file_reads_++;
        if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
          if (!rep->file->use_direct_io() &&
              (data_block_handle.offset() +
                   static_cast<size_t>(data_block_handle.size()) +
                   kBlockTrailerSize >
               readahead_limit_)) {
            // Buffered I/O
            // Discarding the return status of Prefetch calls intentionally, as
            // we can fallback to reading from disk if Prefetch fails.
            rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
            readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
                                                   readahead_size_);
            // Keep exponentially increasing readahead size until
            // kMaxAutoReadaheadSize.
            readahead_size_ =
                std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
          } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
            // Direct I/O
            // Let FilePrefetchBuffer take care of the readahead.
            prefetch_buffer_.reset(
                new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
                                       kMaxAutoReadaheadSize));
          }
        }
      } else if (!prefetch_buffer_) {
        // Explicit user requested readahead
        // The actual condition is:
        // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
        prefetch_buffer_.reset(new FilePrefetchBuffer(
            rep->file.get(), read_options_.readahead_size,
            read_options_.readahead_size));
      }
    }

    Status s;
    table_->NewDataBlockIterator<TBlockIter>(
        read_options_, data_block_handle, &block_iter_, block_type_,
        key_includes_seq_, index_key_is_full_,
        /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get());
    block_iter_points_to_real_block_ = true;
  }
}
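
// Minimal sketch of how a caller reaches the explicit-readahead branch in
// InitDataBlock() above (illustrative only; the 4 MB value is arbitrary and
// `db` stands for any open rocksdb::DB handle):
//   ReadOptions ro;
//   ro.readahead_size = 4 << 20;  // non-zero: prefetch from the very first IO
//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
// Leaving readahead_size at 0 selects the implicit path instead, which waits
// for kMinNumFileReadsToStartAutoReadahead sequential reads before
// prefetching.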

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
  // TODO: the while loop is inherited from two-level-iterator. We don't know
  // whether a block can be empty, so we cannot simply replace it with an
  // "if".
  do {
    if (!block_iter_.status().ok()) {
      return;
    }
    // Whether the next data block is out of the upper bound, if there is one.
    bool next_block_is_out_of_bound = false;
    if (read_options_.iterate_upper_bound != nullptr &&
        block_iter_points_to_real_block_) {
      next_block_is_out_of_bound =
          (user_comparator_.Compare(*read_options_.iterate_upper_bound,
                                    index_iter_->user_key()) <= 0);
    }
    ResetDataIter();
    index_iter_->Next();
    if (next_block_is_out_of_bound) {
      // The next block is out of bound. No need to read it.
      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
      // We need to make sure this is not the last data block before setting
      // is_out_of_bound_, since the index key for the last data block can be
      // larger than the smallest key of the next file on the same level.
      if (index_iter_->Valid()) {
        is_out_of_bound_ = true;
      }
      return;
    }

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToFirst();
    } else {
      return;
    }
  } while (!block_iter_.Valid());
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  assert(!is_out_of_bound_);

  if (!block_iter_.Valid()) {
    FindBlockForward();
  }
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
  while (!block_iter_.Valid()) {
    if (!block_iter_.status().ok()) {
      return;
    }

    ResetDataIter();
    index_iter_->Prev();

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToLast();
    } else {
      return;
    }
  }

  // We could have checked the lower bound here too, but we opt not to do it
  // for code simplicity.
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
  if (read_options_.iterate_upper_bound != nullptr &&
      block_iter_points_to_real_block_ && block_iter_.Valid()) {
    is_out_of_bound_ = user_comparator_.Compare(
                           *read_options_.iterate_upper_bound, user_key()) <= 0;
  }
}
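
// Sketch of the caller-side setup that CheckOutOfBound() reacts to
// (illustrative; the bound value is arbitrary):
//   Slice upper("key9");
//   ReadOptions ro;
//   ro.iterate_upper_bound = &upper;  // iteration stops before "key9"
// Note that only the pointer is stored, so the Slice must outlive the
// iterator.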

InternalIterator* BlockBasedTable::NewIterator(
    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
    Arena* arena, bool skip_filters, bool for_compaction) {
  BlockCacheLookupContext lookup_context{
      for_compaction ? BlockCacheLookupCaller::kCompaction
                     : BlockCacheLookupCaller::kUserIterator};
  bool need_upper_bound_check =
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  if (arena == nullptr) {
    return new BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(
            read_options,
            need_upper_bound_check &&
                rep_->index_type == BlockBasedTableOptions::kHashSearch,
            /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
  } else {
    auto* mem =
        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
    return new (mem) BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(read_options, need_upper_bound_check,
                         /*input_iter=*/nullptr, /*get_context=*/nullptr,
                         &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
  }
}

FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
    const ReadOptions& read_options) {
  if (rep_->fragmented_range_dels == nullptr) {
    return nullptr;
  }
  SequenceNumber snapshot = kMaxSequenceNumber;
  if (read_options.snapshot != nullptr) {
    snapshot = read_options.snapshot->GetSequenceNumber();
  }
  return new FragmentedRangeTombstoneIterator(
      rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
}

bool BlockBasedTable::FullFilterKeyMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    const Slice& internal_key, const bool no_io,
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return true;
  }
  Slice user_key = ExtractUserKey(internal_key);
  const Slice* const const_ikey_ptr = &internal_key;
  bool may_match = true;
  if (filter->whole_key_filtering()) {
    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
    may_match =
        filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
                            no_io, const_ikey_ptr, lookup_context);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0 &&
             prefix_extractor->InDomain(user_key) &&
             !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
                                     prefix_extractor, kNotValid, false,
                                     const_ikey_ptr, lookup_context)) {
    may_match = false;
  }
  if (may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
  }
  return may_match;
}
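
// Sketch of the prefix path above (illustrative): with a fixed-length prefix
// extractor configured at table-build time, e.g.
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
// a lookup whose first four bytes never appeared in the file can be rejected
// by the filter without touching any data block, provided the extractor name
// still matches the one recorded in the table properties.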

void BlockBasedTable::FullFilterKeysMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    MultiGetRange* range, const bool no_io,
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return;
  }
  if (filter->whole_key_filtering()) {
    filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
                         lookup_context);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0) {
    for (auto iter = range->begin(); iter != range->end(); ++iter) {
      Slice user_key = iter->lkey->user_key();

      if (!prefix_extractor->InDomain(user_key)) {
        range->SkipKey(iter);
      }
    }
    filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
                             lookup_context);
  }
}

Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                            GetContext* get_context,
                            const SliceTransform* prefix_extractor,
                            bool skip_filters) {
  assert(key.size() >= 8);  // key must be internal key
  Status s;
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  bool may_match;
  FilterBlockReader* filter = nullptr;
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet};
  {
    if (!skip_filters) {
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               get_context, &lookup_context);
    }
    filter = filter_entry.GetValue();

    // First check the full filter; if the full filter is not useful, then
    // go into each block.
    may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io,
                                      prefix_extractor, &lookup_context);
  }
  if (!may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
  } else {
    IndexBlockIter iiter_on_stack;
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         get_context, &lookup_context);
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
    bool matched = false;  // if such user key matched a key in SST
    bool done = false;
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
      BlockHandle handle = iiter->value();

      bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() &&
          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
                               prefix_extractor, handle.offset(), no_io,
                               /*const_ikey_ptr=*/nullptr, &lookup_context);

      if (not_exist_in_filter) {
        // Not found
        // TODO: think about interaction with Merge. If a user key cannot
        // cross one data block, we should be fine.
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
        break;
      } else {
        BlockCacheLookupContext lookup_data_block_context{
            BlockCacheLookupCaller::kUserGet};
        bool does_referenced_key_exist = false;
        DataBlockIter biter;
        uint64_t referenced_data_size = 0;
        NewDataBlockIterator<DataBlockIter>(
            read_options, iiter->value(), &biter, BlockType::kData,
            /*key_includes_seq=*/true,
            /*index_key_is_full=*/true, get_context, &lookup_data_block_context,
            /*s=*/Status(), /*prefetch_buffer*/ nullptr);

        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        // If user-specified timestamp is supported, we cannot end the search
        // just because hash index lookup indicates the key+ts does not exist.
        if (!may_exist && ts_sz == 0) {
          // HashSeek cannot find the key in this block and the iter is not at
          // the end of the block, i.e. the key cannot be in the following
          // blocks either. In this case, the seek_key cannot be found, so we
          // break from the top level for-loop.
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }

            if (!get_context->SaveValue(
                    parsed_key, biter.value(), &matched,
                    biter.IsValuePinned() ? &biter : nullptr)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
          }
          s = biter.status();
        }
        // Write the block cache access record.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
        }
      }

      if (done) {
        // Avoid the extra Next which is expensive in two-level indexes
        break;
      }
    }
    if (matched && filter != nullptr && !filter->IsBlockBased()) {
      RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                rep_->level);
    }
    if (s.ok()) {
      s = iiter->status();
    }
  }

  return s;
}

using MultiGetRange = MultiGetContext::Range;
void BlockBasedTable::MultiGet(const ReadOptions& read_options,
                               const MultiGetRange* mget_range,
                               const SliceTransform* prefix_extractor,
                               bool skip_filters) {
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet};
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  FilterBlockReader* filter = nullptr;
  MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
                               mget_range->end());
  {
    if (!skip_filters) {
      // TODO: Figure out where the stats should go
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               /*get_context=*/nullptr, &lookup_context);
    }
    filter = filter_entry.GetValue();

    // First check the full filter; if the full filter is not useful, then
    // go into each block.
    FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
                           prefix_extractor, &lookup_context);
  }
  if (skip_filters || !sst_file_range.empty()) {
    IndexBlockIter iiter_on_stack;
    // if prefix_extractor found in block differs from options, disable
    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         sst_file_range.begin()->get_context, &lookup_context);
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    DataBlockIter biter;
    uint64_t offset = std::numeric_limits<uint64_t>::max();
    for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
         ++miter) {
      Status s;
      GetContext* get_context = miter->get_context;
      const Slice& key = miter->ikey;
      bool matched = false;  // if such user key matched a key in SST
      bool done = false;
      for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
        bool reusing_block = true;
        uint64_t referenced_data_size = 0;
        bool does_referenced_key_exist = false;
        BlockCacheLookupContext lookup_data_block_context(
            BlockCacheLookupCaller::kUserMGet);
        if (iiter->value().offset() != offset) {
          offset = iiter->value().offset();
          biter.Invalidate(Status::OK());
          NewDataBlockIterator<DataBlockIter>(
              read_options, iiter->value(), &biter, BlockType::kData,
              /*key_includes_seq=*/false,
              /*index_key_is_full=*/true, get_context,
              &lookup_data_block_context, Status(), nullptr);
          reusing_block = false;
        }
        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key in this block and the iter is not at
          // the end of the block, i.e. the key cannot be in the following
          // blocks either. In this case, the seek_key cannot be found, so we
          // break from the top level for-loop.
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            Cleanable dummy;
            Cleanable* value_pinner = nullptr;

            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }
            if (biter.IsValuePinned()) {
              if (reusing_block) {
                Cache* block_cache = rep_->table_options.block_cache.get();
                assert(biter.cache_handle() != nullptr);
                block_cache->Ref(biter.cache_handle());
                dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
                                      biter.cache_handle());
                value_pinner = &dummy;
              } else {
                value_pinner = &biter;
              }
            }

            if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
                                        value_pinner)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
          }
          s = biter.status();
        }
        // Write the block cache access.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
        }
        if (done) {
          // Avoid the extra Next which is expensive in two-level indexes
          break;
        }
      }
      if (matched && filter != nullptr && !filter->IsBlockBased()) {
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                  rep_->level);
      }
      if (s.ok()) {
        s = iiter->status();
      }
      *(miter->s) = s;
    }
  }
}

Status BlockBasedTable::Prefetch(const Slice* const begin,
                                 const Slice* const end) {
  auto& comparator = rep_->internal_comparator;
  auto user_comparator = comparator.user_comparator();
  // pre-condition
  if (begin && end && comparator.Compare(*begin, *end) > 0) {
    return Status::InvalidArgument(*begin, *end);
  }
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
  IndexBlockIter iiter_on_stack;
  auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                                &iiter_on_stack, /*get_context=*/nullptr,
                                &lookup_context);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
  if (iiter != &iiter_on_stack) {
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
  }

  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }

  // indicates if we are on the last page that needs to be pre-fetched
  bool prefetching_boundary_page = false;

  for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
       iiter->Next()) {
    BlockHandle block_handle = iiter->value();
    const bool is_user_key = rep_->table_properties &&
                             rep_->table_properties->index_key_is_user_key > 0;
    if (end &&
        ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
         (is_user_key &&
          user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
      if (prefetching_boundary_page) {
        break;
      }

      // The index entry represents the last key in the data block.
      // We should load this page into memory as well, but no more
      prefetching_boundary_page = true;
    }

    // Load the block specified by the block_handle into the block cache
    DataBlockIter biter;

    NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, &lookup_context, Status(),
        /*prefetch_buffer=*/nullptr);

    if (!biter.status().ok()) {
      // there was an unexpected error while pre-fetching
      return biter.status();
    }
  }

  return Status::OK();
}
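
// Usage sketch (illustrative; `table` stands for any BlockBasedTable
// instance): passing nullptr for both bounds prefetches every data block
// into the block cache.
//   Status s = table->Prefetch(/*begin=*/nullptr, /*end=*/nullptr);
// With non-null bounds, blocks are loaded from the one containing *begin up
// to and including the boundary block that covers *end.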

Status BlockBasedTable::VerifyChecksum() {
  // TODO(haoyu): This function is called by external sst ingestion and the
  // verify checksum public API. We don't log its block cache accesses for now.
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter);
  if (s.ok()) {
    s = VerifyChecksumInMetaBlocks(meta_iter.get());
    if (!s.ok()) {
      return s;
    }
  } else {
    return s;
  }
  // Check Data blocks
  IndexBlockIter iiter_on_stack;
  InternalIteratorBase<BlockHandle>* iiter = NewIndexIterator(
      ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
  if (iiter != &iiter_on_stack) {
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
  }
  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }
  s = VerifyChecksumInBlocks(iiter);
  return s;
}

Status BlockBasedTable::VerifyChecksumInBlocks(
    InternalIteratorBase<BlockHandle>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle = index_iter->value();
    BlockContents contents;
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

Status BlockBasedTable::VerifyChecksumInMetaBlocks(
    InternalIteratorBase<Slice>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle;
    Slice input = index_iter->value();
    s = handle.DecodeFrom(&input);
    BlockContents contents;
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) {
      TableProperties* table_properties;
      s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
                                           index_iter->value(),
                                           &table_properties);
      delete table_properties;
    }
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
  assert(rep_ != nullptr);

  Cache* const cache = rep_->table_options.block_cache.get();
  if (cache == nullptr) {
    return false;
  }

  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);

  Cache::Handle* const cache_handle = cache->Lookup(cache_key);
  if (cache_handle == nullptr) {
    return false;
  }

  cache->Release(cache_handle);

  return true;
}

bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
  iiter->Seek(key);
  assert(iiter->Valid());

  return TEST_BlockInCache(iiter->value());
}

BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
  // Some old versions of block-based tables don't have the index type present
  // in table properties. If that's the case we can safely use kBinarySearch.
  BlockBasedTableOptions::IndexType index_type_on_file =
      BlockBasedTableOptions::kBinarySearch;
  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
      // update index_type with the true type
      rep_->index_type = index_type_on_file;
    }
  }
  return index_type_on_file;
}

// REQUIRES: The following fields of rep_ should have already been populated:
//  1. file
//  2. index_handle,
//  3. options
//  4. internal_comparator
//  5. index_type
Status BlockBasedTable::CreateIndexReader(
    FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
    bool pin, IndexReader** index_reader,
    BlockCacheLookupContext* lookup_context) {
  auto index_type_on_file = rep_->index_type;

  // kHashSearch requires a non-empty prefix_extractor, but we bypass checking
  // prefix_extractor here since we have no access to MutableCFOptions.
  // The need_upper_bound_check flag is added in
  // BlockBasedTable::NewIndexIterator. If prefix_extractor does not match
  // prefix_extractor_name from the table properties, Hash Index is turned off
  // by setting total_order_seek to true.

  switch (index_type_on_file) {
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
                                          prefetch, pin, index_reader,
                                          lookup_context);
    }
    case BlockBasedTableOptions::kBinarySearch: {
      return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                             prefetch, pin, index_reader,
                                             lookup_context);
    }
    case BlockBasedTableOptions::kHashSearch: {
      std::unique_ptr<Block> meta_guard;
      std::unique_ptr<InternalIterator> meta_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      if (meta_index_iter == nullptr) {
        auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard);
        if (!s.ok()) {
          // we simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          ROCKS_LOG_WARN(rep_->ioptions.info_log,
                         "Unable to read the metaindex block."
                         " Fall back to binary search index.");
          return BinarySearchIndexReader::Create(this, prefetch_buffer,
                                                 use_cache, prefetch, pin,
                                                 index_reader, lookup_context);
        }
        meta_index_iter = meta_iter_guard.get();
      }

      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
                                     use_cache, prefetch, pin, index_reader,
                                     lookup_context);
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(index_type_on_file);
      return Status::InvalidArgument(error_message.c_str());
    }
  }
}

uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
                                              bool for_compaction) {
  BlockCacheLookupContext context(
      for_compaction ? BlockCacheLookupCaller::kCompaction
                     : BlockCacheLookupCaller::kUserApproximateSize);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/&context));

  index_iter->Seek(key);
  uint64_t result;
  if (index_iter->Valid()) {
    BlockHandle handle = index_iter->value();
    result = handle.offset();
  } else {
    // key is past the last key in the file. If table_properties is not
    // available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
    result = 0;
    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
    }
    // table_properties is not present in the table.
    if (result == 0) {
      result = rep_->footer.metaindex_handle().offset();
    }
  }
  return result;
}
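
// Illustrative use (hypothetical `reader` and keys): the byte span of a key
// range can be estimated by differencing two offsets.
//   uint64_t lo = reader->ApproximateOffsetOf(k1, /*for_compaction=*/false);
//   uint64_t hi = reader->ApproximateOffsetOf(k2, /*for_compaction=*/false);
//   uint64_t approx_bytes = hi - lo;  // assumes k1 <= k2 in key order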

bool BlockBasedTable::TEST_filter_block_preloaded() const {
  return rep_->filter != nullptr;
}

bool BlockBasedTable::TEST_IndexBlockInCache() const {
  assert(rep_ != nullptr);

  return TEST_BlockInCache(rep_->footer.index_handle());
}

Status BlockBasedTable::GetKVPairsFromDataBlocks(
    std::vector<KVPairBlock>* kv_pair_blocks) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));

  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    // Cannot read Index Block
    return s;
  }

  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();

    if (!s.ok()) {
      break;
    }

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      // Error reading the block - Skipped
      continue;
    }

    KVPairBlock kv_pair_block;
    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        // Error reading the block - Skipped
        break;
      }
      const Slice& key = datablock_iter->key();
      const Slice& value = datablock_iter->value();
      std::string key_copy = std::string(key.data(), key.size());
      std::string value_copy = std::string(value.data(), value.size());

      kv_pair_block.push_back(
          std::make_pair(std::move(key_copy), std::move(value_copy)));
    }
    kv_pair_blocks->push_back(std::move(kv_pair_block));
  }
  return Status::OK();
}

Status BlockBasedTable::DumpTable(WritableFile* out_file,
                                  const SliceTransform* prefix_extractor) {
  // Output Footer
  out_file->Append(
      "Footer Details:\n"
      "--------------------------------------\n"
      "  ");
  out_file->Append(rep_->footer.ToString().c_str());
  out_file->Append("\n");

  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter);
  if (s.ok()) {
    for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
      s = meta_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (meta_iter->key() == rocksdb::kPropertiesBlock) {
        out_file->Append("  Properties block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
        out_file->Append("  Compression dictionary block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (strstr(meta_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append("  Filter block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
        out_file->Append("  Range deletion block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const rocksdb::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
    out_file->Append("\n");

    // Output Filter blocks
    if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
      // Support only BloomFilter as of now
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
      if (table_properties->filter_policy_name.compare(
              table_options.filter_policy->Name()) == 0) {
        std::string filter_block_key = kFilterBlockPrefix;
        filter_block_key.append(table_properties->filter_policy_name);
        BlockHandle handle;
        if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
          BlockContents block;
          BlockFetcher block_fetcher(
              rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer,
              ReadOptions(), handle, &block, rep_->ioptions,
              false /*decompress*/, false /*maybe_compressed*/,
              UncompressionDict::GetEmptyDict(),
              rep_->persistent_cache_options);
          s = block_fetcher.ReadBlockContents();
          if (s.ok()) {
            rep_->filter.reset(new BlockBasedFilterBlockReader(
                prefix_extractor, table_options,
                table_options.whole_key_filtering, std::move(block),
                rep_->ioptions.statistics));
          }
        }
      }
    }
  }
  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<const BlockContents> compression_dict_block;
    s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */,
                                 &compression_dict_block);
    if (!s.ok()) {
      return s;
    }
    assert(compression_dict_block != nullptr);
    auto compression_dict = compression_dict_block->data;
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append("  size (bytes): ");
    out_file->Append(rocksdb::ToString(compression_dict.size()));
    out_file->Append("\n\n");
    out_file->Append("  HEX    ");
    out_file->Append(compression_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          "  ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
    }
    delete range_del_iter;
  }
  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

void BlockBasedTable::Close() {
  if (rep_->closed) {
    return;
  }

  Cache* const cache = rep_->table_options.block_cache.get();

  // cleanup index, filter, and compression dictionary blocks
  // to avoid accessing dangling pointers
  if (!rep_->table_options.no_block_cache) {
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];

    // Get the filter block key
    auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                           rep_->filter_handle, cache_key);
    cache->Erase(key);

    if (!rep_->compression_dict_handle.IsNull()) {
      // Get the compression dictionary block key
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        rep_->compression_dict_handle, cache_key);
      cache->Erase(key);
    }
  }

  rep_->closed = true;
}

Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
    if (rep_->table_properties &&
        rep_->table_properties->index_key_is_user_key != 0) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append("  HEX    ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");

    std::string str_key = user_key.ToString();
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append("  ASCII  ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
3819
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
3820 3821 3822
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_contex=*/nullptr));
3823 3824 3825 3826 3827 3828
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

3829 3830 3831 3832
  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

3833 3834 3835 3836 3837 3838 3839 3840
  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

3841
    BlockHandle bh = blockhandles_iter->value();
3842 3843 3844 3845 3846
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

3847
    out_file->Append("Data Block # ");
S
sdong 已提交
3848
    out_file->Append(rocksdb::ToString(block_id));
3849 3850 3851 3852 3853
    out_file->Append(" @ ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

S
sdong 已提交
3854
    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped\n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped\n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

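  // block_id starts at 1 and is incremented once per data block, so after
  // the loop it is one past the number of blocks visited.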
  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(rocksdb::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

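// Writes one key/value pair in two forms: a "HEX" line containing hex dumps
// of the user key and value, then an "ASCII" line that prints each byte
// followed by a space, rendering NUL bytes as "\0". For example, a key "k1"
// with value "v1" would come out roughly as:
//
//   HEX    6B31: 7631
//   ASCII  k 1 : v 1
//   ------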
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

namespace {

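// Deleter callbacks handed to the block cache: the cache invokes them when
// the corresponding entry is evicted or erased.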
void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
  FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
  if (filter->statistics() != nullptr) {
    RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
               filter->ApproximateMemoryUsage());
  }
  delete filter;
}

void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) {
  UncompressionDict* dict = reinterpret_cast<UncompressionDict*>(value);
  RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
             dict->ApproximateMemoryUsage());
  delete dict;
}
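
// Illustrative sketch (assumed call-site shape, not the exact code): both
// functions match the Cache deleter signature
// void (*)(const Slice& key, void* value) and are registered when a parsed
// object is inserted into the block cache, so the cache can free the heap
// object on eviction. `cache_key`, `filter`, and `cache_handle` below are
// hypothetical stand-ins for values computed at the real call site.
//
//   Cache* block_cache = rep_->table_options.block_cache.get();
//   Status s = block_cache->Insert(
//       cache_key, filter, filter->ApproximateMemoryUsage(),
//       &DeleteCachedFilterEntry, &cache_handle);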

}  // anonymous namespace

}  // namespace rocksdb