//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/xxhash.h"

namespace rocksdb {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

BlockBasedTable::~BlockBasedTable() {
  Close();
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
  BlockContents contents;
  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
                             &contents, ioptions, do_uncompress,
                             maybe_compressed, block_type, uncompression_dict,
                             cache_options, memory_allocator);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(new Block(std::move(contents), global_seqno,
                            read_amp_bytes_per_bit, ioptions.statistics));
  }

  return s;
}
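// (Note on the helper above: the returned Block takes ownership of the
// fetched contents; global_seqno is the sequence number substituted for the
// keys of ingested external files, and a non-zero read_amp_bytes_per_bit
// enables the block's read-amplification bitmap.)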

inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
             ? table_options.block_cache->memory_allocator()
             : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
             ? table_options.block_cache_compressed->memory_allocator()
             : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

void DeleteCachedFilterEntry(const Slice& key, void* value);
void DeleteCachedUncompressionDictEntry(const Slice& key, void* value);

// Release the cached entry and decrement its ref count.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For hash based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}

}  // namespace
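// Illustration of PrefixExtractorChanged() above (a hypothetical example, not
// from this file): a table built with NewFixedPrefixTransform(4) records
// "rocksdb.FixedPrefix.4" as its prefix_extractor_name; reopening it with a
// different transform, or none, makes the function return true, which forces
// total-order seeks so the on-disk hash index is safely bypassed.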

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
                    CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
                               FilePrefetchBuffer* prefetch_buffer,
                               const ReadOptions& read_options,
                               GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    return &table_->get_rep()->internal_comparator;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_key_is_user_key;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);

    const TableProperties* const properties =
        table_->get_rep()->table_properties.get();

    return properties == nullptr || !properties->index_value_is_delta_encoded;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
               ? index_block_.GetValue()->ApproximateMemoryUsage()
               : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context);

  return s;
}

Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
                        get_context, lookup_context, index_block);
}
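// Note on GetOrReadIndexBlock() above: when the index block was pinned or
// preloaded at open time, index_block_ is non-empty and lookups short-circuit
// to it with no block-cache traffic; otherwise every call goes through
// ReadIndexBlock(), and passing no_io == true restricts that read to the
// block cache (kBlockCacheTier) so it fails fast rather than touching disk.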

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new PartitionIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  // Return a two-level iterator: the first level iterates the partition index
  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    InternalIteratorBase<BlockHandle>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(
              table(), &partition_map_, index_key_includes_seq(),
              index_value_is_full()),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIterator<IndexBlockIter>(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_key_includes_seq(),
              index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          index_key_includes_seq(), index_value_is_full());
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before reading the partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
    auto rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
                     "Error retrieving top-level index block while trying to "
                     "cache index partitions: %s",
                     s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value();
    uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    auto& file = rep->file;
    prefetch_buffer.reset(new FilePrefetchBuffer());
    s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value();
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }
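  // (The single Prefetch() call above covers the byte range from the first
  // partition's offset through the last partition plus its block trailer, so
  // the per-partition reads in the loop are served from the prefetch buffer
  // instead of issuing one I/O per partition.)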

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
                       CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    *index_reader = new BinarySearchIndexReader(table, std::move(index_block));

    return Status::OK();
  }

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_key_includes_seq(), index_value_is_full());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
                          CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to quicken the lookup for a given
// key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
                       FilePrefetchBuffer* prefetch_buffer,
                       InternalIterator* meta_index_iter, bool use_cache,
                       bool prefetch, bool pin, IndexReader** index_reader,
                       BlockCacheLookupContext* lookup_context) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    auto rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(),
                         /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note: failure to create the prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.

    auto new_index_reader = new HashIndexReader(table, std::move(index_block));
    *index_reader = new_index_reader;

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
                             &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
                      &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                 prefixes_contents.data,
                                 prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      new_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }
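  // (Design note: Create() deliberately returns OK even when the prefix-index
  // meta blocks are missing or unreadable; prefix_index_ then stays null and
  // NewIterator() below degrades to plain binary search on the index block.)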

  InternalIteratorBase<BlockHandle>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<BlockHandle>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIterator<IndexBlockIter>(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_key_includes_seq(),
        index_value_is_full(), false /* block_contents_pinned */,
        prefix_index_.get());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size((void*)this);
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}
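// The three Update*Metrics helpers above split the accounting on purpose:
// with a GetContext (point lookups) the counts accumulate in
// get_context_stats_ and are assumed to be reported to the Statistics object
// in bulk once the get finishes; without one (iterators, prefetching) the
// tickers are recorded directly.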

Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}
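// Routing block-cache probes through GetEntryFromCache() keeps the
// per-block-type hit/miss accounting above consistent: each probe records
// exactly one hit or one miss against the matching counters.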

// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (*size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}
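// The resulting cache keys are "<prefix><varint64 block offset>" (assembled
// in GetCacheKey() below), where the prefix is the file's unique ID when the
// filesystem provides one, or a cache-assigned ID otherwise; this keeps
// entries from different SST files distinct within a shared block cache.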

namespace {
// Return true if `user_prop_name` in table_properties has a `true` value
// or the property is missing (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older version doesn't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
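  // Summary of the cases handled below (a reading aid, not new logic):
  //   no version property, no seqno property -> OK, global seqno disabled
  //   no version property, seqno present     -> Corruption (not external)
  //   version < 2, seqno present             -> Corruption (v1 lacked it)
  //   version >= 2                           -> use the stored seqno (0 if
  //                                             absent) and reconcile it with
  //                                             largest_seqno when known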
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno "
             "property with value %llu, which is greater than "
             "kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
}  // namespace

Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                   preload_all, &prefetch_buffer);

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with a newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not mutate rep_ after the creation. e.g. internal_prefix_transform
  // raw pointer will be used to create HashIndexReader, whose reset may
  // access a dangling pointer.
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->index_type = table_options.index_type;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefix correctly.
  rep->internal_prefix_transform.reset(
      new InternalKeySliceTransform(prefix_extractor));
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter);
  if (!s.ok()) {
    return s;
  }

  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(),
                                     largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all,
      table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before read footer, readahead backwards to prefetch data. Do more
    // readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  Status s;
  // TODO should not have this special logic in the future.
  if (!file->use_direct_io()) {
    prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true));
    s = file->Prefetch(prefetch_off, prefetch_len);
  } else {
    prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true));
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return s;
}
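// Example of the heuristic above: on a first open with no prefetch history,
// a reader that will load index and filter blocks speculates a 512 KB tail,
// otherwise only 4 KB; the effective tail size observed by Open() is fed back
// through TailPrefetchStats so later opens can size the prefetch better.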

Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
                      uint32_t expected) {
  Status s;
  uint32_t actual = 0;
  switch (type) {
    case kNoChecksum:
      break;
    case kCRC32c:
      expected = crc32c::Unmask(expected);
      actual = crc32c::Value(buf, len);
      break;
    case kxxHash:
      actual = XXH32(buf, static_cast<int>(len), 0);
      break;
    case kxxHash64:
      actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
                                     uint64_t{0xffffffff});
      break;
    default:
      s = Status::Corruption("unknown checksum type");
  }
  if (s.ok() && actual != expected) {
    s = Status::Corruption("properties block checksum mismatched");
  }
  return s;
}

Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(),
                                block_size + 1, value);
  }
  return s;
}

Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
  bool found_properties_block = true;
  Status s;
  s = SeekToPropertiesBlock(meta_iter, &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(
          meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, true /* verify_checksum */,
          nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
          false /* compression_type_missing */, nullptr /* memory_allocator */);
    }

    if (s.IsCorruption()) {
      s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
                                           &table_properties);
    }
    std::unique_ptr<TableProperties> props_guard;
    if (table_properties != nullptr) {
      props_guard.reset(table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties.reset(props_guard.release());
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }
#ifndef ROCKSDB_LITE
  if (rep_->table_properties) {
    ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
                        &(rep_->table_prefix_extractor));
  }
#endif  // ROCKSDB_LITE

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.info_log);
    rep_->prefix_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kPrefixFiltering,
                           rep_->ioptions.info_log);

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
    }
  }
  return s;
}
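// (blocks_maybe_compressed / blocks_definitely_zstd_compressed, set above
// from the stored compression name, let later block reads skip needless
// decompression work; the ZSTD flag in particular is assumed to allow the
// compression dictionary to be pre-digested for faster per-block use.)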

Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
  Status s;
  bool found_range_del_block;
  BlockHandle range_del_handle;
  s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (found_range_del_block && !range_del_handle.IsNull()) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle,
        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}

Status BlockBasedTable::ReadCompressionDictBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<const BlockContents>* compression_dict_block) const {
  assert(compression_dict_block != nullptr);
  Status s;
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()};
    PersistentCacheOptions cache_options;
    ReadOptions read_options;
    read_options.verify_checksums = true;
    BlockFetcher compression_block_fetcher(
        rep_->file.get(), prefetch_buffer, rep_->footer, read_options,
        rep_->compression_dict_handle, compression_dict_cont.get(),
        rep_->ioptions, false /* decompress */, false /*maybe_compressed*/,
        BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(),
        cache_options);
    s = compression_block_fetcher.ReadBlockContents();

    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from compression dictionary "
          "block %s",
          s.ToString().c_str());
    } else {
      *compression_dict_block = std::move(compression_dict_cont);
    }
  }
  return s;
}

Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    BlockCacheLookupContext* lookup_context) {
  Status s;

  // Find filter handle and filter type
  if (rep_->filter_policy) {
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep_->filter_policy->Name());
      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
              .ok()) {
        rep_->filter_type = filter_type;
        break;
      }
    }
  }

  {
    // Find compression dictionary handle
    bool found_compression_dict;
    s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
                                   &rep_->compression_dict_handle);
  }

  BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  // prefetch the first level of index
  const bool prefetch_index =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // prefetch the first level of filter
  const bool prefetch_filter =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partition filters cannot be enabled without partition indexes
  assert(!prefetch_filter || prefetch_index);
  // pin both index and filters, down to all partitions
  const bool pin_all =
      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // pin the first level of index
  const bool pin_index =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // pin the first level of filter
  const bool pin_filter =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);
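  // Illustrative example (added commentary, not from the original source):
  // for an L0 file with pin_l0_filter_and_index_blocks_in_cache == true, a
  // two-level index and a partitioned filter, pin_all is true and therefore
  // pin_index and pin_filter are true as well; prefetch_index and
  // prefetch_filter additionally become true whenever prefetch_all is set or
  // pin_top_level_index_and_filter (true by default) is set.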

  IndexReader* index_reader = nullptr;
  if (s.ok()) {
    s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
                                     prefetch_index, pin_index, &index_reader,
                                     lookup_context);
    if (s.ok()) {
      assert(index_reader != nullptr);
      rep_->index_reader.reset(index_reader);
      // The partitions of a partitioned index are always stored in the block
      // cache. They hence follow the pin and prefetch configuration regardless
      // of the value of cache_index_and_filter_blocks
      if (prefetch_all) {
        rep_->index_reader->CacheDependencies(pin_all);
      }
    } else {
      delete index_reader;
      index_reader = nullptr;
    }
  }

  // pre-fetching of blocks is turned on
  // Will use block cache for meta-blocks access
  // Always prefetch index and filter for level 0
  // TODO(ajkr): also prefetch compression dictionary block
  // TODO(ajkr): also pin compression dictionary block when
  // `pin_l0_filter_and_index_blocks_in_cache == true`.
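  // (Added commentary) The branch below warms the block cache through
  // GetFilter() when cache_index_and_filter_blocks is true; otherwise the
  // filter and the uncompression dictionary are read eagerly into
  // rep_->filter and rep_->uncompression_dict, which this reader owns for
  // its lifetime.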
  if (table_options.cache_index_and_filter_blocks) {
    assert(table_options.block_cache != nullptr);
    if (s.ok() && prefetch_filter) {
      // Hack: Call GetFilter() to implicitly add filter to the block_cache
      auto filter_entry =
          new_table->GetFilter(rep_->table_prefix_extractor.get(),
                               /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                               /*get_context=*/nullptr, lookup_context);
      if (filter_entry.GetValue() != nullptr && prefetch_all) {
        filter_entry.GetValue()->CacheDependencies(
            pin_all, rep_->table_prefix_extractor.get());
      }
      // if pin_filter is true then save it in rep_->filter_entry; it will be
      // released in the destructor only, hence it will be pinned in the
      // cache while this reader is alive
      if (pin_filter) {
        rep_->filter_entry = std::move(filter_entry);
      }
    }
  } else {
    std::unique_ptr<const BlockContents> compression_dict_block;
    if (s.ok()) {
      // Set filter block
      if (rep_->filter_policy) {
        const bool is_a_filter_partition = true;
        auto filter = new_table->ReadFilter(
            prefetch_buffer, rep_->filter_handle, !is_a_filter_partition,
            rep_->table_prefix_extractor.get());
        rep_->filter.reset(filter);
        // Refer to the comment above about partitioned indexes always being
        // cached
        if (filter && prefetch_all) {
          filter->CacheDependencies(pin_all,
                                    rep_->table_prefix_extractor.get());
        }
      }
      s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    }
    if (s.ok() && !rep_->compression_dict_handle.IsNull()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      rep_->uncompression_dict.reset(new UncompressionDict(
          compression_dict_block->data.ToString(),
          rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics));
    }
  }
  return s;
}

void BlockBasedTable::SetupForCompaction() {
  switch (rep_->ioptions.access_hint_on_compaction_start) {
    case Options::NONE:
      break;
    case Options::NORMAL:
      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
      break;
    case Options::SEQUENTIAL:
      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
      break;
    case Options::WILLNEED:
      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
      break;
    default:
      assert(false);
  }
}

std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}

size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict) {
    usage += rep_->uncompression_dict->ApproximateMemoryUsage();
  }
  return usage;
}

// Load the meta-block from the file. On success, return the loaded meta block
// and its iterator.
Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer,
                                      std::unique_ptr<Block>* meta_block,
                                      std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  std::unique_ptr<Block> meta;
  Status s = ReadBlockFromFile(
      rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
      rep_->footer.metaindex_handle(), &meta, rep_->ioptions,
      true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
      kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
      GetMemoryAllocator(rep_->table_options));

  if (!s.ok()) {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Encountered error while reading data from meta-index"
                    " block %s",
                    s.ToString().c_str());
    return s;
  }

  *meta_block = std::move(meta);
  // meta block uses bytewise comparator.
  iter->reset(meta_block->get()->NewIterator<DataBlockIter>(
      BytewiseComparator(), BytewiseComparator()));
  return Status::OK();
}

Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<Block>* block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    GetContext* get_context) const {
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  assert(block);
  assert(block->IsEmpty());
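  // (Added commentary) Lookup order: first the uncompressed block cache; on
  // a miss, the compressed block cache, whose entry is decompressed and, if
  // read_options.fill_cache is set, promoted into the uncompressed cache.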

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
                                          block_type, get_context);
    if (cache_handle != nullptr) {
      block->SetCachedValue(
          reinterpret_cast<Block*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);

  Statistics* statistics = rep_->ioptions.statistics;

  // If found in the compressed cache, uncompress it and insert it into the
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type = compressed_block->get_compression_type();
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressBlockContents(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert uncompressed block into block cache
  if (s.ok()) {
    std::unique_ptr<Block> block_holder(
        new Block(std::move(contents), rep_->get_global_seqno(block_type),
                  read_amp_bytes_per_bit, statistics));  // uncompressed block

    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                              &DeleteCachedEntry<Block>, &cache_handle);
#ifndef NDEBUG
      block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
      if (s.ok()) {
        assert(cache_handle != nullptr);
        block->SetCachedValue(block_holder.release(), block_cache,
                              cache_handle);

        UpdateCacheInsertionMetrics(block_type, get_context, charge);
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      block->SetOwnedValue(block_holder.release());
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    CachableEntry<Block>* cached_block, BlockContents* raw_block_contents,
    CompressionType raw_block_comp_type,
    const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
    MemoryAllocator* memory_allocator, BlockType block_type,
    GetContext* get_context) const {
  const ImmutableCFOptions& ioptions = rep_->ioptions;
  const uint32_t format_version = rep_->table_options.format_version;
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  const Cache::Priority priority =
      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
              (block_type == BlockType::kFilter ||
               block_type == BlockType::kCompressionDictionary ||
               block_type == BlockType::kIndex)
          ? Cache::Priority::HIGH
          : Cache::Priority::LOW;
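  // Illustrative example (added commentary): with
  // cache_index_and_filter_blocks_with_high_priority == true, filter, index
  // and compression dictionary blocks are inserted with HIGH priority, while
  // data blocks always use LOW priority.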
  assert(cached_block);
  assert(cached_block->IsEmpty());
  assert(raw_block_comp_type == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  Statistics* statistics = ioptions.statistics;

  std::unique_ptr<Block> block_holder;
  if (raw_block_comp_type != kNoCompression) {
    // Retrieve the uncompressed contents into a new buffer
    BlockContents uncompressed_block_contents;
    UncompressionContext context(raw_block_comp_type);
    UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
    s = UncompressBlockContents(info, raw_block_contents->data.data(),
                                raw_block_contents->data.size(),
                                &uncompressed_block_contents, format_version,
                                ioptions, memory_allocator);
    if (!s.ok()) {
      return s;
    }

    block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  } else {
    block_holder.reset(new Block(std::move(*raw_block_contents), seq_no,
                                 read_amp_bytes_per_bit, statistics));
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr &&
      raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
      raw_block_contents->own_bytes()) {
#ifndef NDEBUG
    assert(raw_block_contents->is_raw_block);
#endif  // NDEBUG

    // We cannot directly put raw_block_contents because this could point to
    // an object on the stack.
    BlockContents* block_cont_for_comp_cache =
        new BlockContents(std::move(*raw_block_contents));
    s = block_cache_compressed->Insert(
        compressed_block_cache_key, block_cont_for_comp_cache,
        block_cont_for_comp_cache->ApproximateMemoryUsage(),
        &DeleteCachedEntry<BlockContents>);
    if (s.ok()) {
      // The cache took ownership of block_cont_for_comp_cache; do not delete
      // it below.
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
      delete block_cont_for_comp_cache;
    }
  }

  // insert into uncompressed block cache
  if (block_cache != nullptr && block_holder->own_bytes()) {
    size_t charge = block_holder->ApproximateMemoryUsage();
    Cache::Handle* cache_handle = nullptr;
    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                            &DeleteCachedEntry<Block>, &cache_handle, priority);
#ifndef NDEBUG
    block_cache->TEST_mark_as_data_block(block_cache_key, charge);
#endif  // NDEBUG
    if (s.ok()) {
      assert(cache_handle != nullptr);
      cached_block->SetCachedValue(block_holder.release(), block_cache,
                                   cache_handle);

      UpdateCacheInsertionMetrics(block_type, get_context, charge);
    } else {
      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
    }
  } else {
    cached_block->SetOwnedValue(block_holder.release());
  }

  return s;
}

FilterBlockReader* BlockBasedTable::ReadFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle,
    const bool is_a_filter_partition,
    const SliceTransform* prefix_extractor) const {
  auto& rep = rep_;
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == Rep::FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;

  BlockFetcher block_fetcher(
      rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
      filter_handle, &block, rep->ioptions, false /* decompress */,
      false /*maybe_compressed*/, BlockType::kFilter,
      UncompressionDict::GetEmptyDict(), rep->persistent_cache_options,
      GetMemoryAllocator(rep->table_options));
  Status s = block_fetcher.ReadBlockContents();

  if (!s.ok()) {
    // Error reading the block
    return nullptr;
  }

  assert(rep->filter_policy);

  auto filter_type = rep->filter_type;
  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
      is_a_filter_partition) {
    filter_type = Rep::FilterType::kFullFilter;
  }

  switch (filter_type) {
    case Rep::FilterType::kPartitionedFilter: {
      return new PartitionedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), nullptr,
          rep->ioptions.statistics, rep->internal_comparator, this,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_key_is_user_key == 0,
          rep_->table_properties == nullptr ||
              rep_->table_properties->index_value_is_delta_encoded == 0);
    }

    case Rep::FilterType::kBlockFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block),
          rep->ioptions.statistics);

    case Rep::FilterType::kFullFilter: {
      auto filter_bits_reader =
          rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader != nullptr);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader,
          rep->ioptions.statistics);
    }

    default:
      // filter_type is either kNoFilter (exited the function at the first if),
      // or it must be covered in this switch block
      assert(false);
      return nullptr;
  }
}

CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer,
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  const BlockHandle& filter_blk_handle = rep_->filter_handle;
  const bool is_a_filter_partition = true;
  return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition,
                   no_io, get_context, lookup_context, prefix_extractor);
}

CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
    const bool is_a_filter_partition, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    const SliceTransform* prefix_extractor) const {
  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  if (!is_a_filter_partition &&
      !rep_->table_options.cache_index_and_filter_blocks) {
    return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr,
            /*own_value=*/false};
  }

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    return CachableEntry<FilterBlockReader>();
  }

  if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
    return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
            /*cache_handle=*/nullptr, /*own_value=*/false};
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  // Fetching from the cache
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                         filter_blk_handle, cache_key);

  Cache::Handle* cache_handle =
      GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context);

  FilterBlockReader* filter = nullptr;
  size_t usage = 0;
  bool is_cache_hit = false;
  bool return_empty_reader = false;
  if (cache_handle != nullptr) {
    filter =
        reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
    usage = filter->ApproximateMemoryUsage();
    is_cache_hit = true;
  } else if (no_io) {
    // Do not invoke any io.
    return_empty_reader = true;
  } else {
    filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                        is_a_filter_partition, prefix_extractor);
    if (filter != nullptr) {
      usage = filter->ApproximateMemoryUsage();
      Status s = block_cache->Insert(
          key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);
      if (s.ok()) {
        UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage);
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        delete filter;
        return_empty_reader = true;
      }
    }
  }

  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceFilterBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }

  if (return_empty_reader) {
    return CachableEntry<FilterBlockReader>();
  }
  return {filter, cache_handle ? block_cache : nullptr, cache_handle,
          /*own_value=*/false};
}

CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
    FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  if (!rep_->table_options.cache_index_and_filter_blocks) {
    // block cache is either disabled or not used for meta-blocks. In either
    // case, BlockBasedTableReader is the owner of the uncompression dictionary.
    return {rep_->uncompression_dict.get(), nullptr /* cache */,
            nullptr /* cache_handle */, false /* own_value */};
  }
  if (rep_->compression_dict_handle.IsNull()) {
    return CachableEntry<UncompressionDict>();
  }
  char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  auto cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                  rep_->compression_dict_handle, cache_key_buf);
  auto cache_handle =
      GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key,
                        BlockType::kCompressionDictionary, get_context);
  UncompressionDict* dict = nullptr;
  bool is_cache_hit = false;
  size_t usage = 0;
  if (cache_handle != nullptr) {
    dict = reinterpret_cast<UncompressionDict*>(
        rep_->table_options.block_cache->Value(cache_handle));
    is_cache_hit = true;
    usage = dict->ApproximateMemoryUsage();
  } else if (no_io) {
    // Do not invoke any io.
  } else {
    std::unique_ptr<const BlockContents> compression_dict_block;
    Status s =
        ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block);
    if (s.ok()) {
      assert(compression_dict_block != nullptr);
      // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
      std::unique_ptr<UncompressionDict> uncompression_dict(
          new UncompressionDict(compression_dict_block->data.ToString(),
                                rep_->blocks_definitely_zstd_compressed,
                                rep_->ioptions.statistics));
      usage = uncompression_dict->ApproximateMemoryUsage();
      s = rep_->table_options.block_cache->Insert(
          cache_key, uncompression_dict.get(), usage,
          &DeleteCachedUncompressionDictEntry, &cache_handle,
          rep_->table_options.cache_index_and_filter_blocks_with_high_priority
              ? Cache::Priority::HIGH
              : Cache::Priority::LOW);

      if (s.ok()) {
        UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary,
                                    get_context, usage);
        dict = uncompression_dict.release();
      } else {
        RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
        assert(dict == nullptr);
        assert(cache_handle == nullptr);
      }
    }
  }
  if (block_cache_tracer_ && lookup_context) {
    // Avoid making copy of block_key and cf_name when constructing the access
    // record.
    BlockCacheTraceRecord access_record(
        rep_->ioptions.env->NowMicros(),
        /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock,
        /*block_size=*/usage, rep_->cf_id_for_tracing(),
        /*cf_name=*/"", rep_->level_for_tracing(),
        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
        /*no_insert=*/no_io);
    block_cache_tracer_->WriteBlockAccess(access_record, cache_key,
                                          rep_->cf_name_for_tracing(),
                                          /*referenced_key=*/nullptr);
  }
  return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
          cache_handle, false /* own_value */};
}

// disable_prefix_seek should be set to true when prefix_extractor found in SST
// differs from the one in mutable_cf_options and index type is HashBasedIndex
InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, bool disable_prefix_seek,
    IndexBlockIter* input_iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
  assert(rep_ != nullptr);
  assert(rep_->index_reader != nullptr);

  // We don't return pinned data from index blocks, so no need
  // to set `block_contents_pinned`.
  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
                                         input_iter, get_context,
                                         lookup_context);
}

// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, a new iterator is created and returned.
// If input_iter is not null, this iter is updated and returned.
template <typename TBlockIter>
TBlockIter* BlockBasedTable::NewDataBlockIterator(
    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
    BlockType block_type, bool key_includes_seq, bool index_key_is_full,
    GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
    FilePrefetchBuffer* prefetch_buffer) const {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  if (!s.ok()) {
    iter->Invalidate(s);
    return iter;
  }

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  auto uncompression_dict_storage =
      GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context);
  const UncompressionDict& uncompression_dict =
      uncompression_dict_storage.GetValue() == nullptr
          ? UncompressionDict::GetEmptyDict()
          : *uncompression_dict_storage.GetValue();

  CachableEntry<Block> block;
  s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block,
                    block_type, get_context, lookup_context);

  if (!s.ok()) {
    assert(block.IsEmpty());
    iter->Invalidate(s);
    return iter;
  }

  assert(block.GetValue() != nullptr);
  constexpr bool kTotalOrderSeek = true;
  // Block contents are pinned, and remain pinned after the iterator is
  // destroyed as long as cleanup functions are moved to another object,
  // when:
  // 1. block cache handle is set to be released in cleanup function, or
  // 2. it's pointing to immortal source. If own_bytes is true then we are
  //    not reading data from the original source, whether immortal or not.
  //    Otherwise, the block is pinned iff the source is immortal.
  const bool block_contents_pinned =
      block.IsCached() ||
      (!block.GetValue()->own_bytes() && rep_->immortal_table);
  iter = block.GetValue()->NewIterator<TBlockIter>(
      &rep_->internal_comparator, rep_->internal_comparator.user_comparator(),
      iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
      index_key_is_full, block_contents_pinned);

  if (!block.IsCached()) {
    if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
      // insert a dummy record to block cache to track the memory usage
      Cache* const block_cache = rep_->table_options.block_cache.get();
      Cache::Handle* cache_handle = nullptr;
      // There are two other types of cache keys: 1) SST cache key added in
      // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
      // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
      // from SST cache key(31 bytes), and use non-zero prefix to
      // differentiate from `write_buffer_manager`
      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
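      // Illustrative layout (added commentary; kMaxVarint64Length is 10, so
      // kExtraCacheKeyPrefix is 41):
      //   [cache_key_prefix, zero-padded to 41 bytes][varint64 unique id]
      // for a total key length of at most 51 bytes.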
      // Prefix: use rep_->cache_key_prefix padded by 0s
      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
      assert(rep_->cache_key_prefix_size != 0);
      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                 next_cache_key_id_++);
      assert(end - cache_key <=
             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
      s = block_cache->Insert(unique_key, nullptr,
                              block.GetValue()->ApproximateMemoryUsage(),
                              nullptr, &cache_handle);
      if (s.ok()) {
        assert(cache_handle != nullptr);
        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
                              cache_handle);
      }
    }
  } else {
    iter->SetCacheHandle(block.GetCacheHandle());
  }

  block.TransferTo(iter);
  return iter;
}

Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
  // No point in caching compressed blocks if the table never goes away
  // (immortal)
  Cache* block_cache_compressed =
      rep_->immortal_table ? nullptr
                           : rep_->table_options.block_cache_compressed.get();

  // First, try to get the block from the cache
  //
  // If either block cache is enabled, we'll try to read from it.
  Status s;
  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice key /* key to the block cache */;
  Slice ckey /* key to the compressed block cache */;
  bool is_cache_hit = false;
  bool no_insert = true;
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
                         rep_->compressed_cache_key_prefix_size, handle,
                         compressed_cache_key);
    }

    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
                              ro, block_entry, uncompression_dict, block_type,
                              get_context);
    if (block_entry->GetValue()) {
      // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
      // compressed block cache.
      is_cache_hit = true;
    }
    // Can't find the block from the cache. If I/O is allowed, read from the
    // file.
    if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
      no_insert = false;
      Statistics* statistics = rep_->ioptions.statistics;
      bool do_decompress =
          block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
      CompressionType raw_block_comp_type;
      BlockContents raw_block_contents;
      {
        StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        BlockFetcher block_fetcher(
            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
            &raw_block_contents, rep_->ioptions,
            do_decompress /* do uncompress */, rep_->blocks_maybe_compressed,
            block_type, uncompression_dict, rep_->persistent_cache_options,
            GetMemoryAllocator(rep_->table_options),
            GetMemoryAllocatorForCompressedBlock(rep_->table_options));
        s = block_fetcher.ReadBlockContents();
        raw_block_comp_type = block_fetcher.get_compression_type();
      }

      if (s.ok()) {
        SequenceNumber seq_no = rep_->get_global_seqno(block_type);
        // If filling cache is allowed and a cache is configured, try to put the
        // block to the cache.
        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
                                block_entry, &raw_block_contents,
                                raw_block_comp_type, uncompression_dict, seq_no,
                                GetMemoryAllocator(rep_->table_options),
                                block_type, get_context);
      }
    }
  }

  // Fill lookup_context.
  if (block_cache_tracer_ && lookup_context) {
    size_t usage = 0;
    uint64_t nkeys = 0;
    if (block_entry->GetValue()) {
      // Approximate the number of keys in the block using restarts.
      nkeys = rep_->table_options.block_restart_interval *
              block_entry->GetValue()->NumRestarts();
      usage = block_entry->GetValue()->ApproximateMemoryUsage();
    }
    TraceType trace_block_type = TraceType::kTraceMax;
    switch (block_type) {
      case BlockType::kIndex:
        trace_block_type = TraceType::kBlockTraceIndexBlock;
        break;
      case BlockType::kData:
        trace_block_type = TraceType::kBlockTraceDataBlock;
        break;
      case BlockType::kRangeDeletion:
        trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
        break;
      default:
        // This cannot happen.
        assert(false);
        break;
    }
    if (BlockCacheTraceHelper::ShouldTraceReferencedKey(
            trace_block_type, lookup_context->caller)) {
      // Defer logging the access to Get() and MultiGet() to trace additional
      // information, e.g., the referenced key,
      // referenced_key_exist_in_block.

      // Make a copy of the block key here since it will be logged later.
      lookup_context->FillLookupContext(
          is_cache_hit, no_insert, trace_block_type,
          /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
    } else {
      // Avoid making copy of block_key and cf_name when constructing the access
      // record.
      BlockCacheTraceRecord access_record(
          rep_->ioptions.env->NowMicros(),
          /*block_key=*/"", trace_block_type,
          /*block_size=*/usage, rep_->cf_id_for_tracing(),
          /*cf_name=*/"", rep_->level_for_tracing(),
          rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
          no_insert);
      block_cache_tracer_->WriteBlockAccess(access_record, key,
                                            rep_->cf_name_for_tracing(),
                                            /*referenced_key=*/nullptr);
    }
  }

  assert(s.ok() || block_entry->GetValue() == nullptr);
  return s;
}

Status BlockBasedTable::RetrieveBlock(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<Block>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
  assert(block_entry);
  assert(block_entry->IsEmpty());

  Status s;
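  // (Added commentary) Meta blocks (filter, index, compression dictionary)
  // go through the block cache only when cache_index_and_filter_blocks is
  // set; otherwise they skip the cache lookup and are read directly from the
  // file below.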
  if (rep_->table_options.cache_index_and_filter_blocks ||
      (block_type != BlockType::kFilter &&
       block_type != BlockType::kCompressionDictionary &&
       block_type != BlockType::kIndex)) {
    s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                     uncompression_dict, block_entry,
                                     block_type, get_context, lookup_context);

    if (!s.ok()) {
      return s;
    }

    if (block_entry->GetValue() != nullptr) {
      assert(s.ok());
      return s;
    }
  }

  assert(block_entry->IsEmpty());

  const bool no_io = ro.read_tier == kBlockCacheTier;
  if (no_io) {
    return Status::Incomplete("no blocking io");
  }

  std::unique_ptr<Block> block;

  {
    StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
                 READ_BLOCK_GET_MICROS);
    s = ReadBlockFromFile(
        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
        rep_->ioptions, rep_->blocks_maybe_compressed,
        rep_->blocks_maybe_compressed, block_type, uncompression_dict,
        rep_->persistent_cache_options, rep_->get_global_seqno(block_type),
        block_type == BlockType::kData
            ? rep_->table_options.read_amp_bytes_per_bit
            : 0,
        GetMemoryAllocator(rep_->table_options));
  }

  if (!s.ok()) {
    return s;
  }

  block_entry->SetOwnedValue(block.release());

  assert(s.ok());
  return s;
}

BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
    const BlockBasedTable* table,
    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map,
    bool index_key_includes_seq, bool index_key_is_full)
    : table_(table),
      block_map_(block_map),
      index_key_includes_seq_(index_key_includes_seq),
      index_key_is_full_(index_key_is_full) {}

InternalIteratorBase<BlockHandle>*
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
    const BlockHandle& handle) {
  // Return a block iterator on the index partition
  auto block = block_map_->find(handle.offset());
  // The partition may be missing from block_map_ if the block cache did not
  // have space to pin it; in that case we return an empty iterator below.
  if (block != block_map_->end()) {
    auto rep = table_->get_rep();
    assert(rep);

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    return block->second.GetValue()->NewIterator<IndexBlockIter>(
        &rep->internal_comparator, rep->internal_comparator.user_comparator(),
        nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_);
  }
  // Create an empty iterator
  return new IndexBlockIter();
}

// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
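//
// For example (illustrative, not part of the original comment): with the
// default bytewise comparator, a fixed-length prefix extractor of length 3
// satisfies all three: "foobar" starts with prefix "foo", "foo" <= "foobar",
// and prefix ordering follows key ordering.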
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(
    const Slice& internal_key, const ReadOptions& read_options,
    const SliceTransform* options_prefix_extractor,
    const bool need_upper_bound_check,
    BlockCacheLookupContext* lookup_context) const {
  if (!rep_->filter_policy) {
    return true;
  }

  const SliceTransform* prefix_extractor;

  if (rep_->table_prefix_extractor == nullptr) {
    if (need_upper_bound_check) {
      return true;
    }
    prefix_extractor = options_prefix_extractor;
  } else {
    prefix_extractor = rep_->table_prefix_extractor.get();
  }
  auto user_key = ExtractUserKey(internal_key);
  if (!prefix_extractor->InDomain(user_key)) {
    return true;
  }

  bool may_match = true;
  Status s;

  // First, try to check with the full filter
  auto filter_entry =
      GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false,
                /*get_context=*/nullptr, lookup_context);
  FilterBlockReader* filter = filter_entry.GetValue();
  bool filter_checked = true;
  if (filter != nullptr) {
    if (!filter->IsBlockBased()) {
      const Slice* const const_ikey_ptr = &internal_key;
      may_match = filter->RangeMayExist(
          read_options.iterate_upper_bound, user_key, prefix_extractor,
          rep_->internal_comparator.user_comparator(), const_ikey_ptr,
          &filter_checked, need_upper_bound_check, lookup_context);
    } else {
      // if prefix_extractor changed for block based filter, skip filter
      if (need_upper_bound_check) {
        return true;
      }
      auto prefix = prefix_extractor->Transform(user_key);
      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
      auto internal_prefix = internal_key_prefix.Encode();

      // To prevent any io operation in this method, we set `read_tier` to make
      // sure we always read index or filter only when they have already been
      // loaded to memory.
      ReadOptions no_io_read_options;
      no_io_read_options.read_tier = kBlockCacheTier;

      // Then, try to find it within each block
      // we already know prefix_extractor and prefix_extractor_name must match
      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
      std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
          no_io_read_options,
          /*disable_prefix_seek=*/false, /*input_iter=*/nullptr,
          /*get_context=*/nullptr, lookup_context));
      iiter->Seek(internal_prefix);

      if (!iiter->Valid()) {
        // we're past end of file
        // if it's incomplete, it means that we avoided I/O
        // and we're not really sure that we're past the end
        // of the file
        may_match = iiter->status().IsIncomplete();
      } else if ((rep_->table_properties &&
                          rep_->table_properties->index_key_is_user_key
                      ? iiter->key()
                      : ExtractUserKey(iiter->key()))
                     .starts_with(ExtractUserKey(internal_prefix))) {
        // we need to check for this subtle case because our only
        // guarantee is that "the key is a string >= last key in that data
        // block" according to the doc/table_format.txt spec.
        //
        // Suppose iiter->key() starts with the desired prefix; it is not
        // necessarily the case that the corresponding data block will
        // contain the prefix, since iiter->key() need not be in the
        // block.  However, the next data block may contain the prefix, so
        // we return true to play it safe.
        may_match = true;
      } else if (filter->IsBlockBased()) {
        // iiter->key() does NOT start with the desired prefix.  Because
        // Seek() finds the first key that is >= the seek target, this
        // means that iiter->key() > prefix.  Thus, any data blocks coming
        // after the data block corresponding to iiter->key() cannot
        // possibly contain the key.  Thus, the corresponding data block
        // is the only one that could potentially contain the prefix.
        BlockHandle handle = iiter->value();
        may_match = filter->PrefixMayMatch(
            prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
            /*const_ikey_ptr=*/nullptr, lookup_context);
      }
    }
  }

  if (filter_checked) {
    Statistics* statistics = rep_->ioptions.statistics;
    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
    if (!may_match) {
      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
    }
  }

  return may_match;
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
  is_out_of_bound_ = false;
  if (!CheckPrefixMayMatch(target)) {
    ResetDataIter();
    return;
  }

  bool need_seek_index = true;
  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
    // Reseek.
    prev_index_value_ = index_iter_->value();
    // We can avoid an index seek if:
    // 1. The new seek key is larger than the current key
    // 2. The new seek key is within the upper bound of the block
    // Since we don't necessarily know the internal key for either
    // the current key or the upper bound, we check user keys and
    // exclude the equality case. Considering internal keys can
    // improve for the boundary cases, but it would complicate the
    // code.
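    // Illustrative example (added commentary): if block_iter_ is positioned
    // at user key "k12" and the index entry bounding the current block is
    // "k20", a reseek to "k15" skips the index seek because
    // "k12" < "k15" < "k20".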
    if (user_comparator_.Compare(ExtractUserKey(target),
                                 block_iter_.user_key()) > 0 &&
        user_comparator_.Compare(ExtractUserKey(target),
                                 index_iter_->user_key()) < 0) {
      need_seek_index = false;
    }
  }

  if (need_seek_index) {
    index_iter_->Seek(target);
    if (!index_iter_->Valid()) {
      ResetDataIter();
      return;
    }
    InitDataBlock();
  }

  block_iter_.Seek(target);

  FindKeyForward();
  CheckOutOfBound();
  assert(
      !block_iter_.Valid() ||
      (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) ||
      (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target),
                                                      block_iter_.key()) <= 0));
}

2558 2559 2560
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
2561
  is_out_of_bound_ = false;
2562
  if (!CheckPrefixMayMatch(target)) {
2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594
    ResetDataIter();
    return;
  }

  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely contain the position for `target`, the
  // same as Seek(), rather than the position before it.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4]  [6, 8] [10, 12]
  //  (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they called Seek(7).
  // The only case where the block is different is when they seek to a
  // position on the boundary. For example, if they SeekForPrev(5), we should
  // go to the first block, rather than the second. However, we don't have the
  // information to distinguish the two unless we read the second block. In
  // this case, we'll end up reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    index_iter_->SeekToLast();
    if (!index_iter_->Valid()) {
      ResetDataIter();
      block_iter_points_to_real_block_ = false;
      return;
    }
  }

  InitDataBlock();

  block_iter_.SeekForPrev(target);

  FindKeyBackward();
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToFirst();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
  block_iter_.SeekToFirst();
  FindKeyForward();
  CheckOutOfBound();
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
  is_out_of_bound_ = false;
  SavePrevIndexValue();
  index_iter_->SeekToLast();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
  block_iter_.SeekToLast();
  FindKeyBackward();
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Next();
  FindKeyForward();
}

template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
    Slice* ret_key) {
  Next();
  bool is_valid = Valid();
  if (is_valid) {
    *ret_key = key();
  }
  return is_valid;
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
  assert(block_iter_points_to_real_block_);
  block_iter_.Prev();
  FindKeyBackward();
}

// Based on experiments, a 256 KB readahead size provides the best performance
// for auto readahead. Experiment data is in PR #3282.
template <class TBlockIter, typename TValue>
const size_t
    BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
        256 * 1024;
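
// A rough sketch of the implicit auto-readahead progression (illustrative,
// assuming the initial readahead size defined in the header is 8 KB): after
// kMinNumFileReadsToStartAutoReadahead sequential reads, each buffered-I/O
// prefetch doubles readahead_size_ (8 KB, 16 KB, 32 KB, ...) until it is
// capped at kMaxAutoReadaheadSize (256 KB).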

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value();
  if (!block_iter_points_to_real_block_ ||
      data_block_handle.offset() != prev_index_value_.offset() ||
      // if the previous attempt to read the block missed the cache, try again
      block_iter_.status().IsIncomplete()) {
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

    // Prefetch additional data for range scans (iterators). Enabled only for
    // user reads.
    // Implicit auto readahead:
    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
    // Explicit user requested readahead:
    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
    if (!for_compaction_) {
      if (read_options_.readahead_size == 0) {
        // Implicit auto readahead
        num_file_reads_++;
        if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
          if (!rep->file->use_direct_io() &&
              (data_block_handle.offset() +
                   static_cast<size_t>(data_block_handle.size()) +
                   kBlockTrailerSize >
               readahead_limit_)) {
            // Buffered I/O
            // Discarding the return status of Prefetch calls intentionally, as
            // we can fall back to reading from disk if Prefetch fails.
            rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
            readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
                                                   readahead_size_);
            // Keep exponentially increasing readahead size until
            // kMaxAutoReadaheadSize.
            readahead_size_ =
                std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
          } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
            // Direct I/O
            // Let FilePrefetchBuffer take care of the readahead.
            prefetch_buffer_.reset(
                new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
                                       kMaxAutoReadaheadSize));
          }
        }
      } else if (!prefetch_buffer_) {
        // Explicit user requested readahead
        // The actual condition is:
        // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
        prefetch_buffer_.reset(new FilePrefetchBuffer(
            rep->file.get(), read_options_.readahead_size,
            read_options_.readahead_size));
      }
    }
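
    // Usage sketch for the explicit path (hypothetical caller code, not part
    // of this file):
    //   ReadOptions ro;
    //   ro.readahead_size = 4 << 20;  // prefetch in 4 MB chunks
    //   std::unique_ptr<Iterator> it(db->NewIterator(ro));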

    Status s;
    table_->NewDataBlockIterator<TBlockIter>(
        read_options_, data_block_handle, &block_iter_, block_type_,
        key_includes_seq_, index_key_is_full_,
        /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get());
    block_iter_points_to_real_block_ = true;
  }
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
  // TODO: the while loop is inherited from two-level-iterator. We don't know
  // whether a data block can be empty, so we don't know whether it can safely
  // be replaced by an "if".
  do {
    if (!block_iter_.status().ok()) {
      return;
    }
    // Whether the next data block is out of the upper bound, if there is one.
    bool next_block_is_out_of_bound = false;
    if (read_options_.iterate_upper_bound != nullptr &&
        block_iter_points_to_real_block_) {
      next_block_is_out_of_bound =
          (user_comparator_.Compare(*read_options_.iterate_upper_bound,
                                    index_iter_->user_key()) <= 0);
    }
    ResetDataIter();
    index_iter_->Next();
    if (next_block_is_out_of_bound) {
      // The next block is out of bound. No need to read it.
      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
      // We need to make sure this is not the last data block before setting
      // is_out_of_bound_, since the index key for the last data block can be
      // larger than the smallest key of the next file on the same level.
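      // Illustrative (hypothetical keys): with upper bound "k55", this file's
      // last index key may have been shortened to "l" even though its largest
      // real key is "k49"; marking out-of-bound here would wrongly skip the
      // next file, whose smallest key could be "k50".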
      if (index_iter_->Valid()) {
        is_out_of_bound_ = true;
      }
      return;
    }

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToFirst();
    } else {
      return;
    }
  } while (!block_iter_.Valid());
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  assert(!is_out_of_bound_);

  if (!block_iter_.Valid()) {
    FindBlockForward();
  }
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
  while (!block_iter_.Valid()) {
    if (!block_iter_.status().ok()) {
      return;
    }

    ResetDataIter();
    index_iter_->Prev();

    if (index_iter_->Valid()) {
      InitDataBlock();
      block_iter_.SeekToLast();
    } else {
      return;
    }
  }

  // We could have checked the lower bound here too, but we opt not to, for
  // code simplicity.
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
  if (read_options_.iterate_upper_bound != nullptr &&
      block_iter_points_to_real_block_ && block_iter_.Valid()) {
    is_out_of_bound_ = user_comparator_.Compare(
                           *read_options_.iterate_upper_bound, user_key()) <= 0;
  }
}

InternalIterator* BlockBasedTable::NewIterator(
    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
    Arena* arena, bool skip_filters, bool for_compaction) {
  BlockCacheLookupContext lookup_context{
      for_compaction ? BlockCacheLookupCaller::kCompaction
                     : BlockCacheLookupCaller::kUserIterator};
  bool need_upper_bound_check =
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  if (arena == nullptr) {
    return new BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(
            read_options,
            need_upper_bound_check &&
                rep_->index_type == BlockBasedTableOptions::kHashSearch,
            /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
  } else {
    auto* mem =
        arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
    return new (mem) BlockBasedTableIterator<DataBlockIter>(
        this, read_options, rep_->internal_comparator,
        NewIndexIterator(read_options, need_upper_bound_check,
                         /*input_iter=*/nullptr, /*get_context=*/nullptr,
                         &lookup_context),
        !skip_filters && !read_options.total_order_seek &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, BlockType::kData,
        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
  }
}

FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
    const ReadOptions& read_options) {
  if (rep_->fragmented_range_dels == nullptr) {
    return nullptr;
  }
  SequenceNumber snapshot = kMaxSequenceNumber;
  if (read_options.snapshot != nullptr) {
    snapshot = read_options.snapshot->GetSequenceNumber();
  }
  return new FragmentedRangeTombstoneIterator(
      rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
}

bool BlockBasedTable::FullFilterKeyMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    const Slice& internal_key, const bool no_io,
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return true;
  }
  Slice user_key = ExtractUserKey(internal_key);
  const Slice* const const_ikey_ptr = &internal_key;
  bool may_match = true;
  if (filter->whole_key_filtering()) {
    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
    may_match =
        filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
                            no_io, const_ikey_ptr, lookup_context);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0 &&
             prefix_extractor->InDomain(user_key) &&
             !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
                                     prefix_extractor, kNotValid, false,
                                     const_ikey_ptr, lookup_context)) {
    may_match = false;
  }
  if (may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
  }
  return may_match;
}

void BlockBasedTable::FullFilterKeysMayMatch(
    const ReadOptions& read_options, FilterBlockReader* filter,
    MultiGetRange* range, const bool no_io,
    const SliceTransform* prefix_extractor,
    BlockCacheLookupContext* lookup_context) const {
  if (filter == nullptr || filter->IsBlockBased()) {
    return;
  }
  if (filter->whole_key_filtering()) {
    filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
                         lookup_context);
  } else if (!read_options.total_order_seek && prefix_extractor &&
             rep_->table_properties->prefix_extractor_name.compare(
                 prefix_extractor->Name()) == 0) {
    for (auto iter = range->begin(); iter != range->end(); ++iter) {
      Slice user_key = iter->lkey->user_key();

      if (!prefix_extractor->InDomain(user_key)) {
        range->SkipKey(iter);
      }
    }
    filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
                             lookup_context);
  }
}

Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                            GetContext* get_context,
                            const SliceTransform* prefix_extractor,
                            bool skip_filters) {
  assert(key.size() >= 8);  // key must be internal key
  Status s;
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  bool may_match;
  FilterBlockReader* filter = nullptr;
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet};
  {
    if (!skip_filters) {
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               get_context, &lookup_context);
    }
    filter = filter_entry.GetValue();

    // First check the full filter.
    // If the full filter is not useful, then go into each block.
    may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io,
                                      prefix_extractor, &lookup_context);
  }
  if (!may_match) {
    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
  } else {
    IndexBlockIter iiter_on_stack;
    // If the prefix_extractor found in the block differs from the one in the
    // options, disable BlockPrefixIndex. Only do this check when index_type
    // is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         get_context, &lookup_context);
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    size_t ts_sz =
        rep_->internal_comparator.user_comparator()->timestamp_size();
    bool matched = false;  // if such user key matched a key in SST
    bool done = false;
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
      BlockHandle handle = iiter->value();

      bool not_exist_in_filter =
          filter != nullptr && filter->IsBlockBased() == true &&
          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
                               prefix_extractor, handle.offset(), no_io,
                               /*const_ikey_ptr=*/nullptr, &lookup_context);

      if (not_exist_in_filter) {
        // Not found
        // TODO: think about interaction with Merge. If a user key cannot
        // cross one data block, we should be fine.
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
        break;
      } else {
        BlockCacheLookupContext lookup_data_block_context{
            BlockCacheLookupCaller::kUserGet};
        bool does_referenced_key_exist = false;
        DataBlockIter biter;
        uint64_t referenced_data_size = 0;
        NewDataBlockIterator<DataBlockIter>(
            read_options, iiter->value(), &biter, BlockType::kData,
            /*key_includes_seq=*/true,
            /*index_key_is_full=*/true, get_context, &lookup_data_block_context,
            /*s=*/Status(), /*prefetch_buffer*/ nullptr);

        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        // If user-specified timestamp is supported, we cannot end the search
        // just because hash index lookup indicates the key+ts does not exist.
        if (!may_exist && ts_sz == 0) {
          // HashSeek cannot find the key in this block, and the iter is not at
          // the end of the block, i.e. the key cannot be in the following
          // blocks either. In this case, the seek_key cannot be found, so we
          // break from the top-level for-loop.
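          // (SeekForGet() can prove absence via the data-block hash index,
          // which maps a user-key hash to a restart point inside the block;
          // this is a sketch of the general idea, not of the exact layout.)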
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }
            if (!get_context->SaveValue(
                    parsed_key, biter.value(), &matched,
                    biter.IsValuePinned() ? &biter : nullptr)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
          }
          s = biter.status();
        }
        // Write the block cache access record.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
3050
        }
S
Sanjay Ghemawat 已提交
3051
      }
3052

M
Maysam Yabandeh 已提交
3053 3054 3055 3056
      if (done) {
        // Avoid the extra Next which is expensive in two-level indexes
        break;
      }
    }
    if (matched && filter != nullptr && !filter->IsBlockBased()) {
      RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                rep_->level);
    }
    if (s.ok()) {
      s = iiter->status();
    }
  }

  return s;
}

using MultiGetRange = MultiGetContext::Range;
void BlockBasedTable::MultiGet(const ReadOptions& read_options,
                               const MultiGetRange* mget_range,
                               const SliceTransform* prefix_extractor,
                               bool skip_filters) {
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet};
  const bool no_io = read_options.read_tier == kBlockCacheTier;
  CachableEntry<FilterBlockReader> filter_entry;
  FilterBlockReader* filter = nullptr;
  MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
                               mget_range->end());
  {
    if (!skip_filters) {
      // TODO: Figure out where the stats should go
      filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr,
                               read_options.read_tier == kBlockCacheTier,
                               /*get_context=*/nullptr, &lookup_context);
    }
    filter = filter_entry.GetValue();

    // First check the full filter.
    // If the full filter is not useful, then go into each block.
    FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
                           prefix_extractor, &lookup_context);
  }
  if (skip_filters || !sst_file_range.empty()) {
    IndexBlockIter iiter_on_stack;
    // If the prefix_extractor found in the block differs from the one in the
    // options, disable BlockPrefixIndex. Only do this check when index_type
    // is kHashSearch.
    bool need_upper_bound_check = false;
    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
      need_upper_bound_check = PrefixExtractorChanged(
          rep_->table_properties.get(), prefix_extractor);
    }
    auto iiter =
        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
                         sst_file_range.begin()->get_context, &lookup_context);
    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
    if (iiter != &iiter_on_stack) {
      iiter_unique_ptr.reset(iiter);
    }

    DataBlockIter biter;
    uint64_t offset = std::numeric_limits<uint64_t>::max();
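    // Keys in the batch that land in the same data block reuse `biter`;
    // `offset` remembers which block is currently loaded and starts at a
    // sentinel value no real block can have.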
    for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
         ++miter) {
      Status s;
      GetContext* get_context = miter->get_context;
      const Slice& key = miter->ikey;
      bool matched = false;  // if such user key matched a key in SST
      bool done = false;
      for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
        bool reusing_block = true;
        uint64_t referenced_data_size = 0;
        bool does_referenced_key_exist = false;
        BlockCacheLookupContext lookup_data_block_context(
            BlockCacheLookupCaller::kUserMGet);
        if (iiter->value().offset() != offset) {
          offset = iiter->value().offset();
          biter.Invalidate(Status::OK());
          NewDataBlockIterator<DataBlockIter>(
              read_options, iiter->value(), &biter, BlockType::kData,
              /*key_includes_seq=*/false,
              /*index_key_is_full=*/true, get_context,
              &lookup_data_block_context, Status(), nullptr);
          reusing_block = false;
        }
        if (read_options.read_tier == kBlockCacheTier &&
            biter.status().IsIncomplete()) {
          // couldn't get block from block_cache
          // Update Saver.state to Found because we are only looking for
          // whether we can guarantee the key is not there when "no_io" is set
          get_context->MarkKeyMayExist();
          break;
        }
        if (!biter.status().ok()) {
          s = biter.status();
          break;
        }

        bool may_exist = biter.SeekForGet(key);
        if (!may_exist) {
          // HashSeek cannot find the key in this block, and the iter is not at
          // the end of the block, i.e. the key cannot be in the following
          // blocks either. In this case, the seek_key cannot be found, so we
          // break from the top-level for-loop.
          done = true;
        } else {
          // Call the *saver function on each entry/block until it returns false
          for (; biter.Valid(); biter.Next()) {
            ParsedInternalKey parsed_key;
            Cleanable dummy;
            Cleanable* value_pinner = nullptr;

            if (!ParseInternalKey(biter.key(), &parsed_key)) {
              s = Status::Corruption(Slice());
            }
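            // When the block is being reused across keys, pin the value by
            // taking an extra reference on the block cache handle and
            // releasing it through the local `dummy` Cleanable; otherwise the
            // block iterator itself can act as the pinner.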
            if (biter.IsValuePinned()) {
              if (reusing_block) {
                Cache* block_cache = rep_->table_options.block_cache.get();
                assert(biter.cache_handle() != nullptr);
                block_cache->Ref(biter.cache_handle());
                dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
                                      biter.cache_handle());
                value_pinner = &dummy;
              } else {
                value_pinner = &biter;
              }
            }

            if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
                                        value_pinner)) {
              does_referenced_key_exist = true;
              referenced_data_size = biter.key().size() + biter.value().size();
              done = true;
              break;
            }
          }
          s = biter.status();
        }
        // Write the block cache access.
        if (block_cache_tracer_) {
          // Avoid making copy of block_key, cf_name, and referenced_key when
          // constructing the access record.
          BlockCacheTraceRecord access_record(
              rep_->ioptions.env->NowMicros(),
              /*block_key=*/"", lookup_data_block_context.block_type,
              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
              /*cf_name=*/"", rep_->level_for_tracing(),
              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
              lookup_data_block_context.is_cache_hit,
              lookup_data_block_context.no_insert,
              /*referenced_key=*/"", referenced_data_size,
              lookup_data_block_context.num_keys_in_block,
              does_referenced_key_exist);
          block_cache_tracer_->WriteBlockAccess(
              access_record, lookup_data_block_context.block_key,
              rep_->cf_name_for_tracing(), key);
        }
        if (done) {
          // Avoid the extra Next which is expensive in two-level indexes
          break;
        }
      }
      if (matched && filter != nullptr && !filter->IsBlockBased()) {
        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
                                  rep_->level);
      }
      if (s.ok()) {
        s = iiter->status();
      }
      *(miter->s) = s;
    }
  }
}

Status BlockBasedTable::Prefetch(const Slice* const begin,
                                 const Slice* const end) {
  auto& comparator = rep_->internal_comparator;
  auto user_comparator = comparator.user_comparator();
  // pre-condition
  if (begin && end && comparator.Compare(*begin, *end) > 0) {
    return Status::InvalidArgument(*begin, *end);
  }
  BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch};
  IndexBlockIter iiter_on_stack;
  auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                                &iiter_on_stack, /*get_context=*/nullptr,
                                &lookup_context);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
  if (iiter != &iiter_on_stack) {
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
  }

  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }

  // indicates if we are on the last page that needs to be pre-fetched
  bool prefetching_boundary_page = false;

  for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
       iiter->Next()) {
    BlockHandle block_handle = iiter->value();
    const bool is_user_key = rep_->table_properties &&
                             rep_->table_properties->index_key_is_user_key > 0;
    if (end &&
        ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
         (is_user_key &&
          user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
      if (prefetching_boundary_page) {
        break;
      }

      // The index entry represents the last key in the data block.
      // We should load this page into memory as well, but no further ones.
      prefetching_boundary_page = true;
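      // Illustrative (hypothetical keys): Prefetch("a", "f") over blocks with
      // index keys "c", "g", "k" loads the "c" block, then the boundary "g"
      // block (which may still hold keys in ["d", "f"]), and stops there.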
    }

    // Load the block specified by the block_handle into the block cache
    DataBlockIter biter;

    NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, &lookup_context, Status(),
        /*prefetch_buffer=*/nullptr);

    if (!biter.status().ok()) {
      // there was an unexpected error while pre-fetching
      return biter.status();
    }
  }

  return Status::OK();
}

Status BlockBasedTable::VerifyChecksum() {
  // TODO(haoyu): This function is called by external sst ingestion and the
  // verify checksum public API. We don't log its block cache accesses for now.
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter);
  if (s.ok()) {
    s = VerifyChecksumInMetaBlocks(meta_iter.get());
    if (!s.ok()) {
      return s;
    }
  } else {
    return s;
  }
  // Check Data blocks
  IndexBlockIter iiter_on_stack;
  InternalIteratorBase<BlockHandle>* iiter = NewIndexIterator(
      ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
  if (iiter != &iiter_on_stack) {
    iiter_unique_ptr =
        std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter);
  }
  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }
  s = VerifyChecksumInBlocks(iiter);
  return s;
}

Status BlockBasedTable::VerifyChecksumInBlocks(
    InternalIteratorBase<BlockHandle>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle = index_iter->value();
    BlockContents contents;
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/, BlockType::kData,
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

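// Maps a metaindex block name to the BlockType of the block it refers to.
// For example, a meta block named "filter.rocksdb.BuiltinBloomFilter" starts
// with kFilterBlockPrefix and maps to BlockType::kFilter, while the
// kPropertiesBlock entry maps to BlockType::kProperties.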
BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
    const Slice& meta_block_name) {
  if (meta_block_name.starts_with(kFilterBlockPrefix) ||
      meta_block_name.starts_with(kFullFilterBlockPrefix) ||
      meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
    return BlockType::kFilter;
  }

  if (meta_block_name == kPropertiesBlock) {
    return BlockType::kProperties;
  }

  if (meta_block_name == kCompressionDictBlock) {
    return BlockType::kCompressionDictionary;
  }

  if (meta_block_name == kRangeDelBlock) {
    return BlockType::kRangeDeletion;
  }

  if (meta_block_name == kHashIndexPrefixesBlock) {
    return BlockType::kHashIndexPrefixes;
  }

  if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
    return BlockType::kHashIndexMetadata;
  }

  assert(false);
  return BlockType::kInvalid;
}

Status BlockBasedTable::VerifyChecksumInMetaBlocks(
    InternalIteratorBase<Slice>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle;
    Slice input = index_iter->value();
    s = handle.DecodeFrom(&input);
    BlockContents contents;
    const Slice meta_block_name = index_iter->key();
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
        GetBlockTypeForMetaBlockByName(meta_block_name),
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
      TableProperties* table_properties;
      s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
                                           index_iter->value(),
                                           &table_properties);
      delete table_properties;
    }
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
  assert(rep_ != nullptr);

  Cache* const cache = rep_->table_options.block_cache.get();
  if (cache == nullptr) {
    return false;
  }

  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
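  // Sketch of the cache key layout (inferred from the buffer sizing above):
  // the file's cache key prefix followed by a varint64-encoded block offset.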
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);

  Cache::Handle* const cache_handle = cache->Lookup(cache_key);
  if (cache_handle == nullptr) {
    return false;
  }

  cache->Release(cache_handle);

  return true;
}

bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
  iiter->Seek(key);
  assert(iiter->Valid());

  return TEST_BlockInCache(iiter->value());
}

BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
  // Some old versions of block-based tables don't have the index type present
  // in the table properties. If that's the case we can safely use
  // kBinarySearch.
  BlockBasedTableOptions::IndexType index_type_on_file =
      BlockBasedTableOptions::kBinarySearch;
  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
      // update index_type with the true type
      rep_->index_type = index_type_on_file;
    }
  }
  return index_type_on_file;
}

// REQUIRES: The following fields of rep_ should have already been populated:
//  1. file
//  2. index_handle
//  3. options
//  4. internal_comparator
//  5. index_type
Status BlockBasedTable::CreateIndexReader(
    FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
    bool pin, IndexReader** index_reader,
    BlockCacheLookupContext* lookup_context) {
  auto index_type_on_file = rep_->index_type;

  // kHashSearch requires a non-empty prefix_extractor, but we bypass checking
  // prefix_extractor here since we have no access to MutableCFOptions.
  // The need_upper_bound_check flag is handled in
  // BlockBasedTable::NewIndexIterator: if prefix_extractor does not match
  // prefix_extractor_name from the table properties, Hash Index is turned off
  // by setting total_order_seek to true.

  switch (index_type_on_file) {
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
                                          prefetch, pin, index_reader,
                                          lookup_context);
    }
    case BlockBasedTableOptions::kBinarySearch: {
      return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                             prefetch, pin, index_reader,
                                             lookup_context);
    }
    case BlockBasedTableOptions::kHashSearch: {
      std::unique_ptr<Block> meta_guard;
      std::unique_ptr<InternalIterator> meta_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      if (meta_index_iter == nullptr) {
        auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard);
        if (!s.ok()) {
          // we simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          ROCKS_LOG_WARN(rep_->ioptions.info_log,
                         "Unable to read the metaindex block."
                         " Fall back to binary search index.");
          return BinarySearchIndexReader::Create(this, prefetch_buffer,
                                                 use_cache, prefetch, pin,
                                                 index_reader, lookup_context);
        }
        meta_index_iter = meta_iter_guard.get();
      }

      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
                                     use_cache, prefetch, pin, index_reader,
                                     lookup_context);
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(index_type_on_file);
      return Status::InvalidArgument(error_message.c_str());
    }
  }
}

uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
                                              bool for_compaction) {
  BlockCacheLookupContext context(
      for_compaction ? BlockCacheLookupCaller::kCompaction
                     : BlockCacheLookupCaller::kUserApproximateSize);
  std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/&context));

  index_iter->Seek(key);
  uint64_t result;
  if (index_iter->Valid()) {
    BlockHandle handle = index_iter->value();
    result = handle.offset();
  } else {
    // key is past the last key in the file. If table_properties is not
    // available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
    result = 0;
    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
    }
    // table_properties is not present in the table.
    if (result == 0) {
      result = rep_->footer.metaindex_handle().offset();
    }
  }
  return result;
}
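
// Usage sketch (hypothetical caller code): the on-disk size of a key range
// can be estimated as ApproximateOffsetOf(b) - ApproximateOffsetOf(a); keys
// past the last key yield roughly the table's data size, per the fallback
// above.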

bool BlockBasedTable::TEST_filter_block_preloaded() const {
  return rep_->filter != nullptr;
}

bool BlockBasedTable::TEST_IndexBlockInCache() const {
  assert(rep_ != nullptr);

  return TEST_BlockInCache(rep_->footer.index_handle());
}

Status BlockBasedTable::GetKVPairsFromDataBlocks(
    std::vector<KVPairBlock>* kv_pair_blocks) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));

  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    // Cannot read Index Block
    return s;
  }

  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();

    if (!s.ok()) {
      break;
    }

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      // Error reading the block - Skipped
      continue;
    }

    KVPairBlock kv_pair_block;
    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        // Error reading the block - Skipped
        break;
      }
      const Slice& key = datablock_iter->key();
      const Slice& value = datablock_iter->value();
      std::string key_copy = std::string(key.data(), key.size());
      std::string value_copy = std::string(value.data(), value.size());

      kv_pair_block.push_back(
          std::make_pair(std::move(key_copy), std::move(value_copy)));
    }
    kv_pair_blocks->push_back(std::move(kv_pair_block));
  }
  return Status::OK();
}

Status BlockBasedTable::DumpTable(WritableFile* out_file,
                                  const SliceTransform* prefix_extractor) {
  // Output Footer
  out_file->Append(
      "Footer Details:\n"
      "--------------------------------------\n"
      "  ");
  out_file->Append(rep_->footer.ToString().c_str());
  out_file->Append("\n");

  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<Block> meta;
  std::unique_ptr<InternalIterator> meta_iter;
  Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter);
  if (s.ok()) {
    for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
      s = meta_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (meta_iter->key() == rocksdb::kPropertiesBlock) {
        out_file->Append("  Properties block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) {
        out_file->Append("  Compression dictionary block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (strstr(meta_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append("  Filter block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (meta_iter->key() == rocksdb::kRangeDelBlock) {
        out_file->Append("  Range deletion block handle: ");
        out_file->Append(meta_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const rocksdb::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
    out_file->Append("\n");

    // Output Filter blocks
    if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
      // Support only BloomFilter as of now
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
      if (table_properties->filter_policy_name.compare(
              table_options.filter_policy->Name()) == 0) {
        std::string filter_block_key = kFilterBlockPrefix;
        filter_block_key.append(table_properties->filter_policy_name);
        BlockHandle handle;
        if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
          BlockContents block;
          BlockFetcher block_fetcher(
              rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer,
              ReadOptions(), handle, &block, rep_->ioptions,
              false /*decompress*/, false /*maybe_compressed*/,
              BlockType::kFilter, UncompressionDict::GetEmptyDict(),
              rep_->persistent_cache_options);
          s = block_fetcher.ReadBlockContents();
          if (s.ok()) {
            rep_->filter.reset(new BlockBasedFilterBlockReader(
                prefix_extractor, table_options,
                table_options.whole_key_filtering, std::move(block),
                rep_->ioptions.statistics));
          }
        }
      }
    }
  }
  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        "  ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<const BlockContents> compression_dict_block;
    s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */,
                                 &compression_dict_block);
    if (!s.ok()) {
      return s;
    }
    assert(compression_dict_block != nullptr);
    auto compression_dict = compression_dict_block->data;
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append("  size (bytes): ");
    out_file->Append(rocksdb::ToString(compression_dict.size()));
    out_file->Append("\n\n");
    out_file->Append("  HEX    ");
    out_file->Append(compression_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          "  ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
    }
    delete range_del_iter;
  }
  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

void BlockBasedTable::Close() {
  if (rep_->closed) {
    return;
  }

  Cache* const cache = rep_->table_options.block_cache.get();

  // cleanup index, filter, and compression dictionary blocks
  // to avoid accessing dangling pointers
  if (!rep_->table_options.no_block_cache) {
    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];

    // Get the filter block key
    auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                           rep_->filter_handle, cache_key);
    cache->Erase(key);

    if (!rep_->compression_dict_handle.IsNull()) {
      // Get the compression dictionary block key
      key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                        rep_->compression_dict_handle, cache_key);
      cache->Erase(key);
    }
  }

  rep_->closed = true;
}

Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  out_file->Append("  Block key hex dump: Data block handle\n");
  out_file->Append("  Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
    if (rep_->table_properties &&
        rep_->table_properties->index_key_is_user_key != 0) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append("  HEX    ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");

    std::string str_key = user_key.ToString();
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append("  ASCII  ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n  ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

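  // block_id is 1-based, so after the loop block_id - 1 equals the number of
  // data blocks actually visited.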
  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value();
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(rocksdb::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
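    // Materialize an iterator over the current data block; a block that
    // fails to load is reported and skipped so the rest of the file can
    // still be dumped.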
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
        /*type=*/BlockType::kData,
        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();

    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped \n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped \n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n  # data blocks: ");
    out_file->Append(rocksdb::ToString(num_datablocks));
    out_file->Append("\n  min data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_min));
    out_file->Append("\n  max data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_max));
    out_file->Append("\n  avg data block size: ");
    out_file->Append(rocksdb::ToString(datablock_size_avg));
    out_file->Append("\n");
  }

  return Status::OK();
}

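// Renders one key/value pair in the two-line format used by the dump
// functions above: a HEX line with the user key and value hex-encoded, and
// an ASCII line where every byte is followed by a space and embedded NULs
// are escaped. For example, key "ab" with value "v" prints roughly:
//
//   HEX    6162: 76
//   ASCII  a b : v
//   ------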
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append("  HEX    ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
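  // Expand each byte to "<byte><space>" so the ASCII dump stays aligned,
  // escaping embedded NULs as "\0"; e.g. "a\0b" renders as "a \0 b ".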
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_file->Append("  ASCII  ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n  ------\n");
}

namespace {

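// Deleter callbacks registered with the block cache when filter and
// compression-dictionary blocks are inserted. The cache invokes them on
// eviction or explicit erase, which is where the *_BYTES_EVICT statistics
// get recorded.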
void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) {
  FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value);
  if (filter->statistics() != nullptr) {
    RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT,
               filter->ApproximateMemoryUsage());
  }
  delete filter;
}

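// Counterpart for cached uncompression dictionaries: charge the evicted
// bytes to the statistics before freeing the dictionary.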
void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) {
  UncompressionDict* dict = reinterpret_cast<UncompressionDict*>(value);
  RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
             dict->ApproximateMemoryUsage());
  delete dict;
}

}  // anonymous namespace

}  // namespace rocksdb