db_iter.cc 40.6 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 3 4
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
5 6
//  This source code is also licensed under the GPLv2 license found in the
//  COPYING file in the root directory of this source tree.
7
//
J
jorlow@chromium.org 已提交
8 9 10 11 12
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_iter.h"
13
#include <stdexcept>
14
#include <deque>
S
Stanislau Hlebik 已提交
15
#include <string>
S
Stanislau Hlebik 已提交
16
#include <limits>
J
jorlow@chromium.org 已提交
17 18

#include "db/dbformat.h"
19
#include "db/merge_context.h"
20
#include "db/merge_helper.h"
21
#include "db/pinned_iterators_manager.h"
22
#include "monitoring/perf_context_imp.h"
S
sdong 已提交
23
#include "port/port.h"
24 25 26
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
27
#include "rocksdb/options.h"
S
sdong 已提交
28
#include "table/internal_iterator.h"
29
#include "util/arena.h"
30
#include "util/filename.h"
J
jorlow@chromium.org 已提交
31 32
#include "util/logging.h"
#include "util/mutexlock.h"
33
#include "util/string_util.h"
J
jorlow@chromium.org 已提交
34

35
namespace rocksdb {
J
jorlow@chromium.org 已提交
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56

#if 0
static void DumpInternalIter(Iterator* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey k;
    if (!ParseInternalKey(iter->key(), &k)) {
      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
    } else {
      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
    }
  }
}
#endif

// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries.  DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
 public:
57
  // The following is grossly complicated. TODO: clean it up
J
jorlow@chromium.org 已提交
58 59 60 61 62 63 64 65 66 67
  // Which direction is the iterator currently moving?
  // (1) When moving forward, the internal iterator is positioned at
  //     the exact entry that yields this->key(), this->value()
  // (2) When moving backwards, the internal iterator is positioned
  //     just before all entries whose user key == this->key().
  enum Direction {
    kForward,
    kReverse
  };

68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
  // LocalStatistics contain Statistics counters that will be aggregated per
  // each iterator instance and then will be sent to the global statistics when
  // the iterator is destroyed.
  //
  // The purpose of this approach is to avoid perf regression happening
  // when multiple threads bump the atomic counters from a DBIter::Next().
  struct LocalStatistics {
    explicit LocalStatistics() { ResetCounters(); }

    void ResetCounters() {
      next_count_ = 0;
      next_found_count_ = 0;
      prev_count_ = 0;
      prev_found_count_ = 0;
      bytes_read_ = 0;
    }

    void BumpGlobalStatistics(Statistics* global_statistics) {
      RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
      RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
      RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
      RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
      RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
      ResetCounters();
    }

    // Map to Tickers::NUMBER_DB_NEXT
    uint64_t next_count_;
    // Map to Tickers::NUMBER_DB_NEXT_FOUND
    uint64_t next_found_count_;
    // Map to Tickers::NUMBER_DB_PREV
    uint64_t prev_count_;
    // Map to Tickers::NUMBER_DB_PREV_FOUND
    uint64_t prev_found_count_;
    // Map to Tickers::ITER_BYTES_READ
    uint64_t bytes_read_;
  };

106 107
  DBIter(Env* env, const ReadOptions& read_options,
         const ImmutableCFOptions& cf_options, const Comparator* cmp,
S
sdong 已提交
108
         InternalIterator* iter, SequenceNumber s, bool arena_mode,
109
         uint64_t max_sequential_skip_in_iterations, uint64_t version_number)
110 111
      : arena_mode_(arena_mode),
        env_(env),
112
        logger_(cf_options.info_log),
J
jorlow@chromium.org 已提交
113
        user_comparator_(cmp),
114
        merge_operator_(cf_options.merge_operator),
J
jorlow@chromium.org 已提交
115 116
        iter_(iter),
        sequence_(s),
J
jorlow@chromium.org 已提交
117
        direction_(kForward),
118
        valid_(false),
119
        current_entry_is_merged_(false),
120
        statistics_(cf_options.statistics),
121
        version_number_(version_number),
122 123 124 125 126
        iterate_upper_bound_(read_options.iterate_upper_bound),
        prefix_same_as_start_(read_options.prefix_same_as_start),
        pin_thru_lifetime_(read_options.pin_data),
        total_order_seek_(read_options.total_order_seek),
        range_del_agg_(cf_options.internal_comparator, s,
A
Andrew Kryczka 已提交
127
                       true /* collapse_deletions */) {
L
Lei Jin 已提交
128
    RecordTick(statistics_, NO_ITERATORS);
129
    prefix_extractor_ = cf_options.prefix_extractor;
130
    max_skip_ = max_sequential_skip_in_iterations;
131
    max_skippable_internal_keys_ = read_options.max_skippable_internal_keys;
132 133 134 135 136 137
    if (pin_thru_lifetime_) {
      pinned_iters_mgr_.StartPinning();
    }
    if (iter_) {
      iter_->SetPinnedItersMgr(&pinned_iters_mgr_);
    }
J
jorlow@chromium.org 已提交
138 139
  }
  virtual ~DBIter() {
140
    // Release pinned data if any
141 142 143
    if (pinned_iters_mgr_.PinningEnabled()) {
      pinned_iters_mgr_.ReleasePinnedData();
    }
144
    RecordTick(statistics_, NO_ITERATORS, -1);
145
    local_stats_.BumpGlobalStatistics(statistics_);
146 147 148
    if (!arena_mode_) {
      delete iter_;
    } else {
S
sdong 已提交
149
      iter_->~InternalIterator();
150 151
    }
  }
S
sdong 已提交
152
  virtual void SetIter(InternalIterator* iter) {
153 154
    assert(iter_ == nullptr);
    iter_ = iter;
155
    iter_->SetPinnedItersMgr(&pinned_iters_mgr_);
J
jorlow@chromium.org 已提交
156
  }
A
Andrew Kryczka 已提交
157 158 159 160
  virtual RangeDelAggregator* GetRangeDelAggregator() {
    return &range_del_agg_;
  }

I
Igor Sugak 已提交
161 162
  virtual bool Valid() const override { return valid_; }
  virtual Slice key() const override {
J
jorlow@chromium.org 已提交
163
    assert(valid_);
164
    return saved_key_.GetUserKey();
J
jorlow@chromium.org 已提交
165
  }
I
Igor Sugak 已提交
166
  virtual Slice value() const override {
J
jorlow@chromium.org 已提交
167
    assert(valid_);
168
    if (current_entry_is_merged_) {
169 170 171
      // If pinned_value_ is set then the result of merge operator is one of
      // the merge operands and we should return it.
      return pinned_value_.data() ? pinned_value_ : saved_value_;
172 173 174 175 176
    } else if (direction_ == kReverse) {
      return pinned_value_;
    } else {
      return iter_->value();
    }
J
jorlow@chromium.org 已提交
177
  }
I
Igor Sugak 已提交
178
  virtual Status status() const override {
J
jorlow@chromium.org 已提交
179 180 181 182 183 184
    if (status_.ok()) {
      return iter_->status();
    } else {
      return status_;
    }
  }
185 186 187 188 189 190

  virtual Status GetProperty(std::string prop_name,
                             std::string* prop) override {
    if (prop == nullptr) {
      return Status::InvalidArgument("prop is nullptr");
    }
191
    if (prop_name == "rocksdb.iterator.super-version-number") {
192 193 194 195 196 197
      // First try to pass the value returned from inner iterator.
      if (!iter_->GetProperty(prop_name, prop).ok()) {
        *prop = ToString(version_number_);
      }
      return Status::OK();
    } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
198
      if (valid_) {
199
        *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
200 201 202 203 204 205
      } else {
        *prop = "Iterator is not valid.";
      }
      return Status::OK();
    }
    return Status::InvalidArgument("Undentified property.");
206
  }
J
jorlow@chromium.org 已提交
207

I
Igor Sugak 已提交
208 209 210
  virtual void Next() override;
  virtual void Prev() override;
  virtual void Seek(const Slice& target) override;
A
Aaron Gao 已提交
211
  virtual void SeekForPrev(const Slice& target) override;
I
Igor Sugak 已提交
212 213
  virtual void SeekToFirst() override;
  virtual void SeekToLast() override;
J
jorlow@chromium.org 已提交
214

J
jorlow@chromium.org 已提交
215
 private:
216
  void ReverseToForward();
217
  void ReverseToBackward();
S
Stanislau Hlebik 已提交
218 219 220 221 222 223
  void PrevInternal();
  void FindParseableKey(ParsedInternalKey* ikey, Direction direction);
  bool FindValueForCurrentKey();
  bool FindValueForCurrentKeyUsingSeek();
  void FindPrevUserKey();
  void FindNextUserKey();
224 225
  inline void FindNextUserEntry(bool skipping, bool prefix_check);
  void FindNextUserEntryInternal(bool skipping, bool prefix_check);
J
jorlow@chromium.org 已提交
226
  bool ParseKey(ParsedInternalKey* key);
227
  void MergeValuesNewToOld();
228
  bool TooManyInternalKeysSkipped(bool increment = true);
J
jorlow@chromium.org 已提交
229

230 231 232 233 234 235 236 237 238 239
  // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
  // is called
  void TempPinData() {
    if (!pin_thru_lifetime_) {
      pinned_iters_mgr_.StartPinning();
    }
  }

  // Release blocks pinned by TempPinData()
  void ReleaseTempPinnedData() {
240 241
    if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
      pinned_iters_mgr_.ReleasePinnedData();
242 243 244
    }
  }

J
jorlow@chromium.org 已提交
245 246 247 248 249 250 251 252 253
  inline void ClearSavedValue() {
    if (saved_value_.capacity() > 1048576) {
      std::string empty;
      swap(empty, saved_value_);
    } else {
      saved_value_.clear();
    }
  }

254 255 256 257
  inline void ResetInternalKeysSkippedCounter() {
    num_internal_keys_skipped_ = 0;
  }

258
  const SliceTransform* prefix_extractor_;
259
  bool arena_mode_;
J
jorlow@chromium.org 已提交
260
  Env* const env_;
I
Igor Canadi 已提交
261
  Logger* logger_;
J
jorlow@chromium.org 已提交
262
  const Comparator* const user_comparator_;
263
  const MergeOperator* const merge_operator_;
S
sdong 已提交
264
  InternalIterator* iter_;
J
jorlow@chromium.org 已提交
265
  SequenceNumber const sequence_;
J
jorlow@chromium.org 已提交
266

J
jorlow@chromium.org 已提交
267
  Status status_;
S
Stanislau Hlebik 已提交
268 269
  IterKey saved_key_;
  std::string saved_value_;
270
  Slice pinned_value_;
J
jorlow@chromium.org 已提交
271
  Direction direction_;
J
jorlow@chromium.org 已提交
272
  bool valid_;
273
  bool current_entry_is_merged_;
274
  // for prefix seek mode to support prev()
275
  Statistics* statistics_;
276
  uint64_t max_skip_;
277 278
  uint64_t max_skippable_internal_keys_;
  uint64_t num_internal_keys_skipped_;
279
  uint64_t version_number_;
280
  const Slice* iterate_upper_bound_;
281 282 283
  IterKey prefix_start_buf_;
  Slice prefix_start_key_;
  const bool prefix_same_as_start_;
284 285 286
  // Means that we will pin all data blocks we read as long the Iterator
  // is not deleted, will be true if ReadOptions::pin_data is true
  const bool pin_thru_lifetime_;
287
  const bool total_order_seek_;
288
  // List of operands for merge operator.
289
  MergeContext merge_context_;
A
Andrew Kryczka 已提交
290
  RangeDelAggregator range_del_agg_;
291
  LocalStatistics local_stats_;
292
  PinnedIteratorsManager pinned_iters_mgr_;
J
jorlow@chromium.org 已提交
293 294 295 296 297 298 299 300 301

  // No copying allowed
  DBIter(const DBIter&);
  void operator=(const DBIter&);
};

inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
  if (!ParseInternalKey(iter_->key(), ikey)) {
    status_ = Status::Corruption("corrupted internal key in DBIter");
302 303
    ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s",
                    iter_->key().ToString(true).c_str());
J
jorlow@chromium.org 已提交
304 305 306 307 308 309
    return false;
  } else {
    return true;
  }
}

J
jorlow@chromium.org 已提交
310 311 312
void DBIter::Next() {
  assert(valid_);

313 314
  // Release temporarily pinned blocks from last operation
  ReleaseTempPinnedData();
315
  ResetInternalKeysSkippedCounter();
S
Stanislau Hlebik 已提交
316
  if (direction_ == kReverse) {
317
    ReverseToForward();
318 319 320 321 322 323 324
  } else if (iter_->Valid() && !current_entry_is_merged_) {
    // If the current value is not a merge, the iter position is the
    // current key, which is already returned. We can safely issue a
    // Next() without checking the current key.
    // If the current key is a merge, very likely iter already points
    // to the next internal position.
    iter_->Next();
325
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
J
jorlow@chromium.org 已提交
326
  }
J
jorlow@chromium.org 已提交
327

328 329 330
  if (statistics_ != nullptr) {
    local_stats_.next_count_++;
  }
331 332
  // Now we point to the next internal position, for both of merge and
  // not merge cases.
333 334 335 336
  if (!iter_->Valid()) {
    valid_ = false;
    return;
  }
337
  FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_);
338 339 340 341
  if (statistics_ != nullptr && valid_) {
    local_stats_.next_found_count_++;
    local_stats_.bytes_read_ += (key().size() + value().size());
  }
J
jorlow@chromium.org 已提交
342 343
}

344 345 346 347 348 349 350
// PRE: saved_key_ has the current user key if skipping
// POST: saved_key_ should have the next user key if valid_,
//       if the current entry is a result of merge
//           current_entry_is_merged_ => true
//           saved_value_             => the merged value
//
// NOTE: In between, saved_key_ can point to a user key that has
351
//       a delete marker or a sequence number higher than sequence_
352
//       saved_key_ MUST have a proper user_key before calling this function
353 354 355 356 357 358
//
// The prefix_check parameter controls whether we check the iterated
// keys against the prefix of the seeked key. Set to false when
// performing a seek without a key (e.g. SeekToFirst). Set to
// prefix_same_as_start_ for other iterations.
inline void DBIter::FindNextUserEntry(bool skipping, bool prefix_check) {
359
  PERF_TIMER_GUARD(find_next_user_entry_time);
360
  FindNextUserEntryInternal(skipping, prefix_check);
361 362 363
}

// Actual implementation of DBIter::FindNextUserEntry()
364
void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
J
jorlow@chromium.org 已提交
365 366 367
  // Loop until we hit an acceptable entry to yield
  assert(iter_->Valid());
  assert(direction_ == kForward);
368
  current_entry_is_merged_ = false;
369 370 371 372 373 374 375 376 377 378 379

  // How many times in a row we have skipped an entry with user key less than
  // or equal to saved_key_. We could skip these entries either because
  // sequence numbers were too high or because skipping = true.
  // What saved_key_ contains throughout this method:
  //  - if skipping        : saved_key_ contains the key that we need to skip,
  //                         and we haven't seen any keys greater than that,
  //  - if num_skipped > 0 : saved_key_ contains the key that we have skipped
  //                         num_skipped times, and we haven't seen any keys
  //                         greater than that,
  //  - none of the above  : saved_key_ can contain anything, it doesn't matter.
380
  uint64_t num_skipped = 0;
381

J
jorlow@chromium.org 已提交
382
  do {
J
jorlow@chromium.org 已提交
383
    ParsedInternalKey ikey;
384

385 386 387 388 389
    if (!ParseKey(&ikey)) {
      // Skip corrupted keys.
      iter_->Next();
      continue;
    }
390

391 392 393 394
    if (iterate_upper_bound_ != nullptr &&
        user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) {
      break;
    }
395

396 397 398 399 400 401
    if (prefix_extractor_ && prefix_check &&
        prefix_extractor_->Transform(ikey.user_key)
          .compare(prefix_start_key_) != 0) {
      break;
    }

402 403 404 405
    if (TooManyInternalKeysSkipped()) {
      return;
    }

406 407
    if (ikey.sequence <= sequence_) {
      if (skipping &&
408 409
          user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <=
              0) {
410 411 412 413 414 415 416 417 418
        num_skipped++;  // skip this entry
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
      } else {
        num_skipped = 0;
        switch (ikey.type) {
          case kTypeDeletion:
          case kTypeSingleDeletion:
            // Arrange to skip all upcoming entries for this key since
            // they are hidden by this deletion.
419
            saved_key_.SetUserKey(
420 421 422 423 424 425
                ikey.user_key,
                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
            skipping = true;
            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
            break;
          case kTypeValue:
426
            saved_key_.SetUserKey(
427 428
                ikey.user_key,
                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
429 430 431
            if (range_del_agg_.ShouldDelete(
                    ikey, RangeDelAggregator::RangePositioningMode::
                              kForwardTraversal)) {
432 433 434 435 436
              // Arrange to skip all upcoming entries for this key since
              // they are hidden by this deletion.
              skipping = true;
              num_skipped = 0;
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
437 438 439 440 441 442
            } else {
              valid_ = true;
              return;
            }
            break;
          case kTypeMerge:
443
            saved_key_.SetUserKey(
444 445
                ikey.user_key,
                !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
446 447 448
            if (range_del_agg_.ShouldDelete(
                    ikey, RangeDelAggregator::RangePositioningMode::
                              kForwardTraversal)) {
449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
              // Arrange to skip all upcoming entries for this key since
              // they are hidden by this deletion.
              skipping = true;
              num_skipped = 0;
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
            } else {
              // By now, we are sure the current ikey is going to yield a
              // value
              current_entry_is_merged_ = true;
              valid_ = true;
              MergeValuesNewToOld();  // Go to a different state machine
              return;
            }
            break;
          default:
            assert(false);
            break;
466
        }
J
jorlow@chromium.org 已提交
467
      }
468 469 470 471 472 473
    } else {
      // This key was inserted after our snapshot was taken.
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);

      // Here saved_key_ may contain some old key, or the default empty key, or
      // key assigned by some random other method. We don't care.
474 475
      if (user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <=
          0) {
476 477
        num_skipped++;
      } else {
478 479 480
        saved_key_.SetUserKey(
            ikey.user_key,
            !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
481 482 483
        skipping = false;
        num_skipped = 0;
      }
J
jorlow@chromium.org 已提交
484
    }
485 486 487 488

    // If we have sequentially iterated via numerous equal keys, then it's
    // better to seek so that we can avoid too many key comparisons.
    if (num_skipped > max_skip_) {
489 490
      num_skipped = 0;
      std::string last_key;
491 492 493 494
      if (skipping) {
        // We're looking for the next user-key but all we see are the same
        // user-key with decreasing sequence numbers. Fast forward to
        // sequence number 0 and type deletion (the smallest type).
495 496
        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
                                                       0, kTypeDeletion));
497 498 499 500 501 502 503 504 505
        // Don't set skipping = false because we may still see more user-keys
        // equal to saved_key_.
      } else {
        // We saw multiple entries with this user key and sequence numbers
        // higher than sequence_. Fast forward to sequence_.
        // Note that this only covers a case when a higher key was overwritten
        // many times since our snapshot was taken, not the case when a lot of
        // different keys were inserted after our snapshot was taken.
        AppendInternalKey(&last_key,
506
                          ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
507 508
                                            kValueTypeForSeek));
      }
509 510 511 512 513
      iter_->Seek(last_key);
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
    } else {
      iter_->Next();
    }
J
jorlow@chromium.org 已提交
514 515
  } while (iter_->Valid());
  valid_ = false;
J
jorlow@chromium.org 已提交
516 517
}

518 519 520 521 522 523 524
// Merge values of the same user key starting from the current iter_ position
// Scan from the newer entries to older entries.
// PRE: iter_->key() points to the first merge type entry
//      saved_key_ stores the user key
// POST: saved_value_ has the merged value for the user key
//       iter_ points to the next entry (or invalid)
void DBIter::MergeValuesNewToOld() {
525
  if (!merge_operator_) {
526
    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
527
    status_ = Status::InvalidArgument("merge_operator_ must be set.");
528 529
    valid_ = false;
    return;
D
Deon Nicholas 已提交
530
  }
531

532 533
  // Temporarily pin the blocks that hold merge operands
  TempPinData();
534
  merge_context_.Clear();
535
  // Start the merge process by pushing the first operand
536 537
  merge_context_.PushOperand(iter_->value(),
                             iter_->IsValuePinned() /* operand_pinned */);
538 539

  ParsedInternalKey ikey;
540
  Status s;
541 542 543 544 545 546
  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
    if (!ParseKey(&ikey)) {
      // skip corrupted key
      continue;
    }

547
    if (!user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
548 549
      // hit the next user key, stop right here
      break;
A
Andrew Kryczka 已提交
550
    } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
551 552 553
               range_del_agg_.ShouldDelete(
                   ikey, RangeDelAggregator::RangePositioningMode::
                             kForwardTraversal)) {
554 555 556 557
      // hit a delete with the same user key, stop right here
      // iter_ is positioned after delete
      iter_->Next();
      break;
A
Andres Noetzli 已提交
558
    } else if (kTypeValue == ikey.type) {
559 560 561
      // hit a put, merge the put value with operands and store the
      // final result in saved_value_. We are done!
      // ignore corruption if there is any.
I
Igor Canadi 已提交
562
      const Slice val = iter_->value();
563 564 565 566 567 568
      s = MergeHelper::TimedFullMerge(
          merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(),
          &saved_value_, logger_, statistics_, env_, &pinned_value_);
      if (!s.ok()) {
        status_ = s;
      }
569 570 571
      // iter_ is positioned after put
      iter_->Next();
      return;
A
Andres Noetzli 已提交
572
    } else if (kTypeMerge == ikey.type) {
573 574
      // hit a merge, add the value as an operand and run associative merge.
      // when complete, add result to operands and continue.
575 576
      merge_context_.PushOperand(iter_->value(),
                                 iter_->IsValuePinned() /* operand_pinned */);
577
      PERF_COUNTER_ADD(internal_merge_count, 1);
A
Andres Noetzli 已提交
578 579
    } else {
      assert(false);
580 581 582
    }
  }

583 584 585 586
  // we either exhausted all internal keys under this user key, or hit
  // a deletion marker.
  // feed null as the existing value to the merge operator, such that
  // client can differentiate this scenario and do things accordingly.
587 588 589 590
  s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
                                  nullptr, merge_context_.GetOperands(),
                                  &saved_value_, logger_, statistics_, env_,
                                  &pinned_value_);
591 592 593
  if (!s.ok()) {
    status_ = s;
  }
594 595
}

J
jorlow@chromium.org 已提交
596 597
void DBIter::Prev() {
  assert(valid_);
598
  ReleaseTempPinnedData();
599
  ResetInternalKeysSkippedCounter();
S
Stanislau Hlebik 已提交
600
  if (direction_ == kForward) {
601
    ReverseToBackward();
S
Stanislau Hlebik 已提交
602 603
  }
  PrevInternal();
M
Manuel Ung 已提交
604
  if (statistics_ != nullptr) {
605
    local_stats_.prev_count_++;
M
Manuel Ung 已提交
606
    if (valid_) {
607 608
      local_stats_.prev_found_count_++;
      local_stats_.bytes_read_ += (key().size() + value().size());
M
Manuel Ung 已提交
609 610
    }
  }
S
Stanislau Hlebik 已提交
611
}
J
jorlow@chromium.org 已提交
612

613 614 615 616
void DBIter::ReverseToForward() {
  if (prefix_extractor_ != nullptr && !total_order_seek_) {
    IterKey last_key;
    last_key.SetInternalKey(ParsedInternalKey(
617 618
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
    iter_->Seek(last_key.GetInternalKey());
619 620 621 622 623
  }
  FindNextUserKey();
  direction_ = kForward;
  if (!iter_->Valid()) {
    iter_->SeekToFirst();
624
    range_del_agg_.InvalidateTombstoneMapPositions();
625 626 627
  }
}

628
void DBIter::ReverseToBackward() {
629 630
  if (prefix_extractor_ != nullptr && !total_order_seek_) {
    IterKey last_key;
631 632 633
    last_key.SetInternalKey(ParsedInternalKey(saved_key_.GetUserKey(), 0,
                                              kValueTypeForSeekForPrev));
    iter_->SeekForPrev(last_key.GetInternalKey());
634
  }
635 636 637 638 639
  if (current_entry_is_merged_) {
    // Not placed in the same key. Need to call Prev() until finding the
    // previous key.
    if (!iter_->Valid()) {
      iter_->SeekToLast();
640
      range_del_agg_.InvalidateTombstoneMapPositions();
641 642 643 644
    }
    ParsedInternalKey ikey;
    FindParseableKey(&ikey, kReverse);
    while (iter_->Valid() &&
645 646
           user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) >
               0) {
647 648 649 650 651
      if (ikey.sequence > sequence_) {
        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
      } else {
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
      }
652 653 654 655 656 657 658 659
      iter_->Prev();
      FindParseableKey(&ikey, kReverse);
    }
  }
#ifndef NDEBUG
  if (iter_->Valid()) {
    ParsedInternalKey ikey;
    assert(ParseKey(&ikey));
660 661
    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <=
           0);
662 663 664 665 666 667 668
  }
#endif

  FindPrevUserKey();
  direction_ = kReverse;
}

S
Stanislau Hlebik 已提交
669 670 671 672
void DBIter::PrevInternal() {
  if (!iter_->Valid()) {
    valid_ = false;
    return;
673 674
  }

S
Stanislau Hlebik 已提交
675 676 677
  ParsedInternalKey ikey;

  while (iter_->Valid()) {
678 679 680
    saved_key_.SetUserKey(
        ExtractUserKey(iter_->key()),
        !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
681

S
Stanislau Hlebik 已提交
682 683
    if (FindValueForCurrentKey()) {
      valid_ = true;
J
jorlow@chromium.org 已提交
684 685 686
      if (!iter_->Valid()) {
        return;
      }
S
Stanislau Hlebik 已提交
687
      FindParseableKey(&ikey, kReverse);
688
      if (user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
S
Stanislau Hlebik 已提交
689
        FindPrevUserKey();
J
jorlow@chromium.org 已提交
690
      }
A
Aaron Gao 已提交
691
      if (valid_ && prefix_extractor_ && prefix_same_as_start_ &&
692
          prefix_extractor_->Transform(saved_key_.GetUserKey())
A
Aaron Gao 已提交
693
                  .compare(prefix_start_key_) != 0) {
694
        valid_ = false;
A
Aaron Gao 已提交
695
      }
S
Stanislau Hlebik 已提交
696
      return;
J
jorlow@chromium.org 已提交
697
    }
698 699 700 701 702

    if (TooManyInternalKeysSkipped(false)) {
      return;
    }

S
Stanislau Hlebik 已提交
703 704 705 706
    if (!iter_->Valid()) {
      break;
    }
    FindParseableKey(&ikey, kReverse);
707
    if (user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
S
Stanislau Hlebik 已提交
708 709 710 711
      FindPrevUserKey();
    }
  }
  // We haven't found any key - iterator is not valid
A
Aaron Gao 已提交
712
  // Or the prefix is different than start prefix
713
  assert(!iter_->Valid());
S
Stanislau Hlebik 已提交
714
  valid_ = false;
J
jorlow@chromium.org 已提交
715 716
}

S
Stanislau Hlebik 已提交
717
// This function checks, if the entry with biggest sequence_number <= sequence_
A
Andres Noetzli 已提交
718 719
// is non kTypeDeletion or kTypeSingleDeletion. If it's not, we save value in
// saved_value_
S
Stanislau Hlebik 已提交
720 721
bool DBIter::FindValueForCurrentKey() {
  assert(iter_->Valid());
722
  merge_context_.Clear();
723
  current_entry_is_merged_ = false;
A
Andres Noetzli 已提交
724 725
  // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
  // kTypeValue)
S
Stanislau Hlebik 已提交
726 727
  ValueType last_not_merge_type = kTypeDeletion;
  ValueType last_key_entry_type = kTypeDeletion;
J
jorlow@chromium.org 已提交
728

S
Stanislau Hlebik 已提交
729 730 731
  ParsedInternalKey ikey;
  FindParseableKey(&ikey, kReverse);

732 733 734
  // Temporarily pin blocks that hold (merge operands / the value)
  ReleaseTempPinnedData();
  TempPinData();
S
Stanislau Hlebik 已提交
735 736
  size_t num_skipped = 0;
  while (iter_->Valid() && ikey.sequence <= sequence_ &&
737
         user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
738 739 740 741
    if (TooManyInternalKeysSkipped()) {
      return false;
    }

S
Stanislau Hlebik 已提交
742 743 744 745 746 747 748 749
    // We iterate too much: let's use Seek() to avoid too much key comparisons
    if (num_skipped >= max_skip_) {
      return FindValueForCurrentKeyUsingSeek();
    }

    last_key_entry_type = ikey.type;
    switch (last_key_entry_type) {
      case kTypeValue:
750 751 752
        if (range_del_agg_.ShouldDelete(
                ikey,
                RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
A
Andrew Kryczka 已提交
753 754 755 756 757 758
          last_key_entry_type = kTypeRangeDeletion;
          PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        } else {
          assert(iter_->IsValuePinned());
          pinned_value_ = iter_->value();
        }
759
        merge_context_.Clear();
A
Andrew Kryczka 已提交
760
        last_not_merge_type = last_key_entry_type;
S
Stanislau Hlebik 已提交
761 762
        break;
      case kTypeDeletion:
A
Andres Noetzli 已提交
763
      case kTypeSingleDeletion:
764
        merge_context_.Clear();
A
Andres Noetzli 已提交
765
        last_not_merge_type = last_key_entry_type;
766
        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
S
Stanislau Hlebik 已提交
767 768
        break;
      case kTypeMerge:
769 770 771
        if (range_del_agg_.ShouldDelete(
                ikey,
                RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
A
Andrew Kryczka 已提交
772 773 774 775 776 777 778 779
          merge_context_.Clear();
          last_key_entry_type = kTypeRangeDeletion;
          last_not_merge_type = last_key_entry_type;
          PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        } else {
          assert(merge_operator_ != nullptr);
          merge_context_.PushOperandBack(
              iter_->value(), iter_->IsValuePinned() /* operand_pinned */);
780
          PERF_COUNTER_ADD(internal_merge_count, 1);
A
Andrew Kryczka 已提交
781
        }
S
Stanislau Hlebik 已提交
782 783 784 785 786
        break;
      default:
        assert(false);
    }

787
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
788
    assert(user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey()));
S
Stanislau Hlebik 已提交
789 790 791 792 793
    iter_->Prev();
    ++num_skipped;
    FindParseableKey(&ikey, kReverse);
  }

794
  Status s;
S
Stanislau Hlebik 已提交
795 796
  switch (last_key_entry_type) {
    case kTypeDeletion:
A
Andres Noetzli 已提交
797
    case kTypeSingleDeletion:
A
Andrew Kryczka 已提交
798
    case kTypeRangeDeletion:
S
Stanislau Hlebik 已提交
799 800 801
      valid_ = false;
      return false;
    case kTypeMerge:
802
      current_entry_is_merged_ = true;
A
Aaron Gao 已提交
803
      if (last_not_merge_type == kTypeDeletion ||
A
Andrew Kryczka 已提交
804 805
          last_not_merge_type == kTypeSingleDeletion ||
          last_not_merge_type == kTypeRangeDeletion) {
806 807 808 809
        s = MergeHelper::TimedFullMerge(
            merge_operator_, saved_key_.GetUserKey(), nullptr,
            merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
            env_, &pinned_value_);
810
      } else {
S
Stanislau Hlebik 已提交
811
        assert(last_not_merge_type == kTypeValue);
812
        s = MergeHelper::TimedFullMerge(
813
            merge_operator_, saved_key_.GetUserKey(), &pinned_value_,
814 815
            merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
            env_, &pinned_value_);
816
      }
S
Stanislau Hlebik 已提交
817 818 819 820 821 822 823
      break;
    case kTypeValue:
      // do nothing - we've already has value in saved_value_
      break;
    default:
      assert(false);
      break;
J
jorlow@chromium.org 已提交
824
  }
S
Stanislau Hlebik 已提交
825
  valid_ = true;
826 827 828
  if (!s.ok()) {
    status_ = s;
  }
S
Stanislau Hlebik 已提交
829 830
  return true;
}
J
jorlow@chromium.org 已提交
831

S
Stanislau Hlebik 已提交
832 833 834
// This function is used in FindValueForCurrentKey.
// We use Seek() function instead of Prev() to find necessary value
bool DBIter::FindValueForCurrentKeyUsingSeek() {
835 836 837
  // FindValueForCurrentKey will enable pinning before calling
  // FindValueForCurrentKeyUsingSeek()
  assert(pinned_iters_mgr_.PinningEnabled());
S
Stanislau Hlebik 已提交
838
  std::string last_key;
839 840
  AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
                                                 sequence_, kValueTypeForSeek));
S
Stanislau Hlebik 已提交
841 842 843 844 845 846 847
  iter_->Seek(last_key);
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);

  // assume there is at least one parseable key for this user key
  ParsedInternalKey ikey;
  FindParseableKey(&ikey, kForward);

A
Andrew Kryczka 已提交
848
  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
849 850
      range_del_agg_.ShouldDelete(
          ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
J
jorlow@chromium.org 已提交
851
    valid_ = false;
S
Stanislau Hlebik 已提交
852 853
    return false;
  }
A
Andrew Kryczka 已提交
854 855 856 857 858 859
  if (ikey.type == kTypeValue) {
    assert(iter_->IsValuePinned());
    pinned_value_ = iter_->value();
    valid_ = true;
    return true;
  }
S
Stanislau Hlebik 已提交
860 861 862

  // kTypeMerge. We need to collect all kTypeMerge values and save them
  // in operands
863
  current_entry_is_merged_ = true;
864
  merge_context_.Clear();
865 866
  while (
      iter_->Valid() &&
867
      user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey()) &&
868 869 870
      ikey.type == kTypeMerge &&
      !range_del_agg_.ShouldDelete(
          ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
871 872
    merge_context_.PushOperand(iter_->value(),
                               iter_->IsValuePinned() /* operand_pinned */);
873
    PERF_COUNTER_ADD(internal_merge_count, 1);
S
Stanislau Hlebik 已提交
874 875 876 877
    iter_->Next();
    FindParseableKey(&ikey, kForward);
  }

878
  Status s;
S
Stanislau Hlebik 已提交
879
  if (!iter_->Valid() ||
880
      !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey()) ||
A
Andrew Kryczka 已提交
881
      ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
882 883
      range_del_agg_.ShouldDelete(
          ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
884
    s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
885 886 887
                                    nullptr, merge_context_.GetOperands(),
                                    &saved_value_, logger_, statistics_, env_,
                                    &pinned_value_);
S
Stanislau Hlebik 已提交
888 889
    // Make iter_ valid and point to saved_key_
    if (!iter_->Valid() ||
890
        !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
S
Stanislau Hlebik 已提交
891 892 893
      iter_->Seek(last_key);
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
    }
J
jorlow@chromium.org 已提交
894
    valid_ = true;
895 896 897
    if (!s.ok()) {
      status_ = s;
    }
S
Stanislau Hlebik 已提交
898 899 900
    return true;
  }

I
Igor Canadi 已提交
901
  const Slice& val = iter_->value();
902 903 904 905
  s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
                                  &val, merge_context_.GetOperands(),
                                  &saved_value_, logger_, statistics_, env_,
                                  &pinned_value_);
S
Stanislau Hlebik 已提交
906
  valid_ = true;
907 908 909
  if (!s.ok()) {
    status_ = s;
  }
S
Stanislau Hlebik 已提交
910 911 912 913 914 915 916 917 918 919 920 921 922 923
  return true;
}

// Used in Next to change directions
// Go to next user key
// Don't use Seek(),
// because next user key will be very close
void DBIter::FindNextUserKey() {
  if (!iter_->Valid()) {
    return;
  }
  ParsedInternalKey ikey;
  FindParseableKey(&ikey, kForward);
  while (iter_->Valid() &&
924
         !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) {
S
Stanislau Hlebik 已提交
925 926 927 928 929 930 931 932 933 934 935 936 937
    iter_->Next();
    FindParseableKey(&ikey, kForward);
  }
}

// Go to previous user_key
void DBIter::FindPrevUserKey() {
  if (!iter_->Valid()) {
    return;
  }
  size_t num_skipped = 0;
  ParsedInternalKey ikey;
  FindParseableKey(&ikey, kReverse);
938
  int cmp;
939 940 941 942
  while (iter_->Valid() &&
         ((cmp = user_comparator_->Compare(ikey.user_key,
                                           saved_key_.GetUserKey())) == 0 ||
          (cmp > 0 && ikey.sequence > sequence_))) {
943 944 945 946
    if (TooManyInternalKeysSkipped()) {
      return;
    }

947 948 949 950 951
    if (cmp == 0) {
      if (num_skipped >= max_skip_) {
        num_skipped = 0;
        IterKey last_key;
        last_key.SetInternalKey(ParsedInternalKey(
952 953
            saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
        iter_->Seek(last_key.GetInternalKey());
954 955 956 957
        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
      } else {
        ++num_skipped;
      }
S
Stanislau Hlebik 已提交
958
    }
959 960 961 962 963
    if (ikey.sequence > sequence_) {
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
    } else {
      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
    }
S
Stanislau Hlebik 已提交
964 965 966 967 968
    iter_->Prev();
    FindParseableKey(&ikey, kReverse);
  }
}

969 970 971 972 973 974 975 976 977 978 979 980
bool DBIter::TooManyInternalKeysSkipped(bool increment) {
  if ((max_skippable_internal_keys_ > 0) &&
      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
    valid_ = false;
    status_ = Status::Incomplete("Too many internal keys skipped.");
    return true;
  } else if (increment) {
    num_internal_keys_skipped_++;
  }
  return false;
}

S
Stanislau Hlebik 已提交
981 982 983 984 985 986 987 988
// Skip all unparseable keys
void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
  while (iter_->Valid() && !ParseKey(ikey)) {
    if (direction == kReverse) {
      iter_->Prev();
    } else {
      iter_->Next();
    }
J
jorlow@chromium.org 已提交
989 990
  }
}
J
jorlow@chromium.org 已提交
991

J
jorlow@chromium.org 已提交
992
void DBIter::Seek(const Slice& target) {
L
Lei Jin 已提交
993
  StopWatch sw(env_, statistics_, DB_SEEK);
994
  ReleaseTempPinnedData();
995
  ResetInternalKeysSkippedCounter();
996 997
  saved_key_.Clear();
  saved_key_.SetInternalKey(target, sequence_);
998 999 1000

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1001
    iter_->Seek(saved_key_.GetInternalKey());
1002
    range_del_agg_.InvalidateTombstoneMapPositions();
1003
  }
M
Manuel Ung 已提交
1004
  RecordTick(statistics_, NUMBER_DB_SEEK);
J
jorlow@chromium.org 已提交
1005
  if (iter_->Valid()) {
1006 1007 1008
    if (prefix_extractor_ && prefix_same_as_start_) {
      prefix_start_key_ = prefix_extractor_->Transform(target);
    }
1009 1010
    direction_ = kForward;
    ClearSavedValue();
1011 1012 1013 1014
    FindNextUserEntry(false /* not skipping */, prefix_same_as_start_);
    if (!valid_) {
      prefix_start_key_.clear();
    }
M
Manuel Ung 已提交
1015 1016 1017 1018 1019 1020
    if (statistics_ != nullptr) {
      if (valid_) {
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
      }
    }
J
jorlow@chromium.org 已提交
1021 1022 1023
  } else {
    valid_ = false;
  }
1024

1025
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1026 1027
    prefix_start_buf_.SetUserKey(prefix_start_key_);
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1028
  }
J
jorlow@chromium.org 已提交
1029
}
J
jorlow@chromium.org 已提交
1030

A
Aaron Gao 已提交
1031 1032 1033
void DBIter::SeekForPrev(const Slice& target) {
  StopWatch sw(env_, statistics_, DB_SEEK);
  ReleaseTempPinnedData();
1034
  ResetInternalKeysSkippedCounter();
A
Aaron Gao 已提交
1035 1036 1037 1038 1039 1040 1041
  saved_key_.Clear();
  // now saved_key is used to store internal key.
  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
                            kValueTypeForSeekForPrev);

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1042
    iter_->SeekForPrev(saved_key_.GetInternalKey());
1043
    range_del_agg_.InvalidateTombstoneMapPositions();
A
Aaron Gao 已提交
1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
  }

  RecordTick(statistics_, NUMBER_DB_SEEK);
  if (iter_->Valid()) {
    if (prefix_extractor_ && prefix_same_as_start_) {
      prefix_start_key_ = prefix_extractor_->Transform(target);
    }
    direction_ = kReverse;
    ClearSavedValue();
    PrevInternal();
    if (!valid_) {
      prefix_start_key_.clear();
    }
    if (statistics_ != nullptr) {
      if (valid_) {
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
      }
    }
  } else {
    valid_ = false;
  }
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1067 1068
    prefix_start_buf_.SetUserKey(prefix_start_key_);
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
A
Aaron Gao 已提交
1069 1070 1071
  }
}

J
jorlow@chromium.org 已提交
1072
void DBIter::SeekToFirst() {
S
Stanislau Hlebik 已提交
1073
  // Don't use iter_::Seek() if we set a prefix extractor
1074
  // because prefix seek will be used.
1075
  if (prefix_extractor_ != nullptr) {
S
Stanislau Hlebik 已提交
1076 1077
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
J
jorlow@chromium.org 已提交
1078
  direction_ = kForward;
1079
  ReleaseTempPinnedData();
1080
  ResetInternalKeysSkippedCounter();
J
jorlow@chromium.org 已提交
1081
  ClearSavedValue();
1082 1083 1084 1085

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
    iter_->SeekToFirst();
1086
    range_del_agg_.InvalidateTombstoneMapPositions();
1087 1088
  }

M
Manuel Ung 已提交
1089
  RecordTick(statistics_, NUMBER_DB_SEEK);
J
jorlow@chromium.org 已提交
1090
  if (iter_->Valid()) {
1091 1092 1093
    saved_key_.SetUserKey(
        ExtractUserKey(iter_->key()),
        !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
1094
    FindNextUserEntry(false /* not skipping */, false /* no prefix check */);
M
Manuel Ung 已提交
1095 1096 1097 1098 1099 1100
    if (statistics_ != nullptr) {
      if (valid_) {
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
      }
    }
J
jorlow@chromium.org 已提交
1101 1102
  } else {
    valid_ = false;
J
jorlow@chromium.org 已提交
1103
  }
1104
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1105 1106 1107
    prefix_start_buf_.SetUserKey(
        prefix_extractor_->Transform(saved_key_.GetUserKey()));
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1108
  }
J
jorlow@chromium.org 已提交
1109 1110
}

J
jorlow@chromium.org 已提交
1111
void DBIter::SeekToLast() {
S
Stanislau Hlebik 已提交
1112
  // Don't use iter_::Seek() if we set a prefix extractor
1113
  // because prefix seek will be used.
1114
  if (prefix_extractor_ != nullptr) {
S
Stanislau Hlebik 已提交
1115 1116
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
J
jorlow@chromium.org 已提交
1117
  direction_ = kReverse;
1118
  ReleaseTempPinnedData();
1119
  ResetInternalKeysSkippedCounter();
J
jorlow@chromium.org 已提交
1120
  ClearSavedValue();
1121 1122 1123 1124

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
    iter_->SeekToLast();
1125
    range_del_agg_.InvalidateTombstoneMapPositions();
1126
  }
1127 1128 1129 1130
  // When the iterate_upper_bound is set to a value,
  // it will seek to the last key before the
  // ReadOptions.iterate_upper_bound
  if (iter_->Valid() && iterate_upper_bound_ != nullptr) {
1131
    SeekForPrev(*iterate_upper_bound_);
1132
    range_del_agg_.InvalidateTombstoneMapPositions();
1133 1134 1135 1136
    if (!Valid()) {
      return;
    } else if (user_comparator_->Equal(*iterate_upper_bound_, key())) {
      Prev();
1137
    }
1138 1139
  } else {
    PrevInternal();
1140
  }
M
Manuel Ung 已提交
1141 1142 1143 1144 1145 1146 1147
  if (statistics_ != nullptr) {
    RecordTick(statistics_, NUMBER_DB_SEEK);
    if (valid_) {
      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
    }
  }
1148
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1149 1150 1151
    prefix_start_buf_.SetUserKey(
        prefix_extractor_->Transform(saved_key_.GetUserKey()));
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1152
  }
J
jorlow@chromium.org 已提交
1153 1154
}

1155 1156 1157 1158 1159 1160 1161 1162 1163 1164
Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
                        const ImmutableCFOptions& cf_options,
                        const Comparator* user_key_comparator,
                        InternalIterator* internal_iter,
                        const SequenceNumber& sequence,
                        uint64_t max_sequential_skip_in_iterations,
                        uint64_t version_number) {
  DBIter* db_iter = new DBIter(
      env, read_options, cf_options, user_key_comparator, internal_iter,
      sequence, false, max_sequential_skip_in_iterations, version_number);
1165
  return db_iter;
1166 1167
}

I
Igor Canadi 已提交
1168
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
1169 1170 1171

void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }

A
Andrew Kryczka 已提交
1172 1173 1174 1175
RangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() {
  return db_iter_->GetRangeDelAggregator();
}

S
sdong 已提交
1176
void ArenaWrappedDBIter::SetIterUnderDBIter(InternalIterator* iter) {
1177 1178 1179 1180 1181 1182 1183 1184 1185
  static_cast<DBIter*>(db_iter_)->SetIter(iter);
}

inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
inline void ArenaWrappedDBIter::Seek(const Slice& target) {
  db_iter_->Seek(target);
}
A
Aaron Gao 已提交
1186 1187 1188
inline void ArenaWrappedDBIter::SeekForPrev(const Slice& target) {
  db_iter_->SeekForPrev(target);
}
1189 1190 1191 1192 1193
inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
1194 1195 1196 1197
inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
                                              std::string* prop) {
  return db_iter_->GetProperty(prop_name, prop);
}
1198 1199 1200 1201
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
                                         void* arg2) {
  db_iter_->RegisterCleanup(function, arg1, arg2);
}
J
jorlow@chromium.org 已提交
1202

1203
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
1204 1205 1206 1207
    Env* env, const ReadOptions& read_options,
    const ImmutableCFOptions& cf_options, const Comparator* user_key_comparator,
    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
    uint64_t version_number) {
1208 1209 1210
  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
  Arena* arena = iter->GetArena();
  auto mem = arena->AllocateAligned(sizeof(DBIter));
1211 1212 1213
  DBIter* db_iter = new (mem)
      DBIter(env, read_options, cf_options, user_key_comparator, nullptr,
             sequence, true, max_sequential_skip_in_iterations, version_number);
1214

1215
  iter->SetDBIter(db_iter);
1216

1217
  return iter;
J
jorlow@chromium.org 已提交
1218 1219
}

1220
}  // namespace rocksdb