db_iter.cc 43.1 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5
//
J
jorlow@chromium.org 已提交
6 7 8 9 10
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_iter.h"
S
Stanislau Hlebik 已提交
11
#include <string>
12
#include <iostream>
S
Stanislau Hlebik 已提交
13
#include <limits>
J
jorlow@chromium.org 已提交
14 15

#include "db/dbformat.h"
16
#include "db/merge_context.h"
17
#include "db/merge_helper.h"
18
#include "db/pinned_iterators_manager.h"
19
#include "file/filename.h"
20
#include "logging/logging.h"
21
#include "memory/arena.h"
22
#include "monitoring/perf_context_imp.h"
23 24 25
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
26
#include "rocksdb/options.h"
S
sdong 已提交
27
#include "table/internal_iterator.h"
28
#include "table/iterator_wrapper.h"
29
#include "trace_replay/trace_replay.h"
J
jorlow@chromium.org 已提交
30
#include "util/mutexlock.h"
31
#include "util/string_util.h"
32
#include "util/user_comparator_wrapper.h"
J
jorlow@chromium.org 已提交
33

34
namespace rocksdb {
J
jorlow@chromium.org 已提交
35 36 37 38 39 40 41 42 43 44 45 46 47 48

#if 0
static void DumpInternalIter(Iterator* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey k;
    if (!ParseInternalKey(iter->key(), &k)) {
      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
    } else {
      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
    }
  }
}
#endif

49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       const ImmutableCFOptions& cf_options,
       const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
       InternalIterator* iter, SequenceNumber s, bool arena_mode,
       uint64_t max_sequential_skip_in_iterations,
       ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
       bool allow_blob)
    : env_(_env),
      logger_(cf_options.info_log),
      user_comparator_(cmp),
      merge_operator_(cf_options.merge_operator),
      iter_(iter),
      read_callback_(read_callback),
      sequence_(s),
      statistics_(cf_options.statistics),
      num_internal_keys_skipped_(0),
      iterate_lower_bound_(read_options.iterate_lower_bound),
      iterate_upper_bound_(read_options.iterate_upper_bound),
      direction_(kForward),
      valid_(false),
      current_entry_is_merged_(false),
      is_key_seqnum_zero_(false),
      prefix_same_as_start_(read_options.prefix_same_as_start),
      pin_thru_lifetime_(read_options.pin_data),
      total_order_seek_(read_options.total_order_seek),
      allow_blob_(allow_blob),
      is_blob_(false),
      arena_mode_(arena_mode),
      range_del_agg_(&cf_options.internal_comparator, s),
      db_impl_(db_impl),
      cfd_(cfd),
      start_seqnum_(read_options.iter_start_seqnum) {
  RecordTick(statistics_, NO_ITERATOR_CREATED);
  prefix_extractor_ = mutable_cf_options.prefix_extractor.get();
  max_skip_ = max_sequential_skip_in_iterations;
  max_skippable_internal_keys_ = read_options.max_skippable_internal_keys;
  if (pin_thru_lifetime_) {
    pinned_iters_mgr_.StartPinning();
  }
  if (iter_.iter()) {
89
    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
J
jorlow@chromium.org 已提交
90
  }
91
}
92

93 94 95
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
  if (prop == nullptr) {
    return Status::InvalidArgument("prop is nullptr");
96
  }
97 98 99 100
  if (prop_name == "rocksdb.iterator.super-version-number") {
    // First try to pass the value returned from inner iterator.
    return iter_.iter()->GetProperty(prop_name, prop);
  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
101
    if (valid_) {
102 103 104
      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
    } else {
      *prop = "Iterator is not valid.";
105
    }
106 107 108 109
    return Status::OK();
  } else if (prop_name == "rocksdb.iterator.internal-key") {
    *prop = saved_key_.GetUserKey().ToString();
    return Status::OK();
110
  }
111 112
  return Status::InvalidArgument("Unidentified property.");
}
113

114
bool DBIter::ParseKey(ParsedInternalKey* ikey) {
115
  if (!ParseInternalKey(iter_.key(), ikey)) {
J
jorlow@chromium.org 已提交
116
    status_ = Status::Corruption("corrupted internal key in DBIter");
117
    valid_ = false;
118
    ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s",
119
                    iter_.key().ToString(true).c_str());
J
jorlow@chromium.org 已提交
120 121 122 123 124 125
    return false;
  } else {
    return true;
  }
}

J
jorlow@chromium.org 已提交
126 127
void DBIter::Next() {
  assert(valid_);
128
  assert(status_.ok());
J
jorlow@chromium.org 已提交
129

130
  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_);
131 132
  // Release temporarily pinned blocks from last operation
  ReleaseTempPinnedData();
133 134 135
  local_stats_.skip_count_ += num_internal_keys_skipped_;
  local_stats_.skip_count_--;
  num_internal_keys_skipped_ = 0;
136
  bool ok = true;
S
Stanislau Hlebik 已提交
137
  if (direction_ == kReverse) {
138
    is_key_seqnum_zero_ = false;
139 140 141
    if (!ReverseToForward()) {
      ok = false;
    }
142
  } else if (!current_entry_is_merged_) {
143 144 145 146 147
    // If the current value is not a merge, the iter position is the
    // current key, which is already returned. We can safely issue a
    // Next() without checking the current key.
    // If the current key is a merge, very likely iter already points
    // to the next internal position.
148 149
    assert(iter_.Valid());
    iter_.Next();
150
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
J
jorlow@chromium.org 已提交
151
  }
J
jorlow@chromium.org 已提交
152

153
  local_stats_.next_count_++;
154
  if (ok && iter_.Valid()) {
155 156 157
    FindNextUserEntry(true /* skipping the current user key */,
                      prefix_same_as_start_);
  } else {
158
    is_key_seqnum_zero_ = false;
159 160
    valid_ = false;
  }
161 162 163 164
  if (statistics_ != nullptr && valid_) {
    local_stats_.next_found_count_++;
    local_stats_.bytes_read_ += (key().size() + value().size());
  }
J
jorlow@chromium.org 已提交
165 166
}

167 168 169 170 171 172 173
// PRE: saved_key_ has the current user key if skipping
// POST: saved_key_ should have the next user key if valid_,
//       if the current entry is a result of merge
//           current_entry_is_merged_ => true
//           saved_value_             => the merged value
//
// NOTE: In between, saved_key_ can point to a user key that has
174
//       a delete marker or a sequence number higher than sequence_
175
//       saved_key_ MUST have a proper user_key before calling this function
176 177 178 179 180
//
// The prefix_check parameter controls whether we check the iterated
// keys against the prefix of the seeked key. Set to false when
// performing a seek without a key (e.g. SeekToFirst). Set to
// prefix_same_as_start_ for other iterations.
181
bool DBIter::FindNextUserEntry(bool skipping, bool prefix_check) {
182
  PERF_TIMER_GUARD(find_next_user_entry_time);
183
  return FindNextUserEntryInternal(skipping, prefix_check);
184 185 186
}

// Actual implementation of DBIter::FindNextUserEntry()
187
bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
J
jorlow@chromium.org 已提交
188
  // Loop until we hit an acceptable entry to yield
189
  assert(iter_.Valid());
190
  assert(status_.ok());
J
jorlow@chromium.org 已提交
191
  assert(direction_ == kForward);
192
  current_entry_is_merged_ = false;
193 194 195 196 197 198 199 200 201 202 203

  // How many times in a row we have skipped an entry with user key less than
  // or equal to saved_key_. We could skip these entries either because
  // sequence numbers were too high or because skipping = true.
  // What saved_key_ contains throughout this method:
  //  - if skipping        : saved_key_ contains the key that we need to skip,
  //                         and we haven't seen any keys greater than that,
  //  - if num_skipped > 0 : saved_key_ contains the key that we have skipped
  //                         num_skipped times, and we haven't seen any keys
  //                         greater than that,
  //  - none of the above  : saved_key_ can contain anything, it doesn't matter.
204
  uint64_t num_skipped = 0;
205 206 207 208 209
  // For write unprepared, the target sequence number in reseek could be larger
  // than the snapshot, and thus needs to be skipped again. This could result in
  // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
  // to one.
  bool reseek_done = false;
210

Y
Yi Wu 已提交
211 212
  is_blob_ = false;

J
jorlow@chromium.org 已提交
213
  do {
214 215 216
    // Will update is_key_seqnum_zero_ as soon as we parsed the current key
    // but we need to save the previous value to be used in the loop.
    bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
217
    if (!ParseKey(&ikey_)) {
218
      is_key_seqnum_zero_ = false;
219
      return false;
220
    }
221

222 223
    is_key_seqnum_zero_ = (ikey_.sequence == 0);

224 225 226
    assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
           user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
    if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
227
        user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
228 229
      break;
    }
230

231
    if (prefix_extractor_ && prefix_check &&
232 233
        prefix_extractor_->Transform(ikey_.user_key)
                .compare(prefix_start_key_) != 0) {
234 235 236
      break;
    }

237
    if (TooManyInternalKeysSkipped()) {
238
      return false;
239 240
    }

Y
Yi Wu 已提交
241
    if (IsVisible(ikey_.sequence)) {
242 243 244 245 246 247 248
      // If the previous entry is of seqnum 0, the current entry will not
      // possibly be skipped. This condition can potentially be relaxed to
      // prev_key.seq <= ikey_.sequence. We are cautious because it will be more
      // prone to bugs causing the same user key with the same sequence number.
      if (!is_prev_key_seqnum_zero && skipping &&
          user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <=
              0) {
249 250 251
        num_skipped++;  // skip this entry
        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
      } else {
252 253
        assert(!skipping || user_comparator_.Compare(
                                ikey_.user_key, saved_key_.GetUserKey()) > 0);
254
        num_skipped = 0;
255
        reseek_done = false;
256
        switch (ikey_.type) {
257 258 259 260
          case kTypeDeletion:
          case kTypeSingleDeletion:
            // Arrange to skip all upcoming entries for this key since
            // they are hidden by this deletion.
261 262 263
            // if iterartor specified start_seqnum we
            // 1) return internal key, including the type
            // 2) return ikey only if ikey.seqnum >= start_seqnum_
264
            // note that if deletion seqnum is < start_seqnum_ we
265 266 267
            // just skip it like in normal iterator.
            if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_)  {
              saved_key_.SetInternalKey(ikey_);
268 269
              valid_ = true;
              return true;
270 271
            } else {
              saved_key_.SetUserKey(
272 273
                  ikey_.user_key, !pin_thru_lifetime_ ||
                                      !iter_.iter()->IsKeyPinned() /* copy */);
274 275
              skipping = true;
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
276
            }
277 278
            break;
          case kTypeValue:
Y
Yi Wu 已提交
279
          case kTypeBlobIndex:
280 281 282 283 284 285 286 287
            if (start_seqnum_ > 0) {
              // we are taking incremental snapshot here
              // incremental snapshots aren't supported on DB with range deletes
              assert(!(
                (ikey_.type == kTypeBlobIndex) && (start_seqnum_ > 0)
              ));
              if (ikey_.sequence >= start_seqnum_) {
                saved_key_.SetInternalKey(ikey_);
Y
Yi Wu 已提交
288
                valid_ = true;
289
                return true;
290 291 292
              } else {
                // this key and all previous versions shouldn't be included,
                // skipping
293 294 295 296
                saved_key_.SetUserKey(
                    ikey_.user_key,
                    !pin_thru_lifetime_ ||
                        !iter_.iter()->IsKeyPinned() /* copy */);
297
                skipping = true;
Y
Yi Wu 已提交
298
              }
299
            } else {
300
              saved_key_.SetUserKey(
301 302
                  ikey_.user_key, !pin_thru_lifetime_ ||
                                      !iter_.iter()->IsKeyPinned() /* copy */);
303
              if (range_del_agg_.ShouldDelete(
304
                      ikey_, RangeDelPositioningMode::kForwardTraversal)) {
305 306 307 308
                // Arrange to skip all upcoming entries for this key since
                // they are hidden by this deletion.
                skipping = true;
                num_skipped = 0;
309
                reseek_done = false;
310 311 312 313 314 315 316 317
                PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
              } else if (ikey_.type == kTypeBlobIndex) {
                if (!allow_blob_) {
                  ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
                  status_ = Status::NotSupported(
                      "Encounter unexpected blob index. Please open DB with "
                      "rocksdb::blob_db::BlobDB instead.");
                  valid_ = false;
318
                  return false;
319
                }
320 321 322 323

                is_blob_ = true;
                valid_ = true;
                return true;
324 325
              } else {
                valid_ = true;
326
                return true;
327
              }
328 329 330
            }
            break;
          case kTypeMerge:
331
            saved_key_.SetUserKey(
332
                ikey_.user_key,
333
                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
334
            if (range_del_agg_.ShouldDelete(
335
                    ikey_, RangeDelPositioningMode::kForwardTraversal)) {
336 337 338 339
              // Arrange to skip all upcoming entries for this key since
              // they are hidden by this deletion.
              skipping = true;
              num_skipped = 0;
340
              reseek_done = false;
341 342 343 344 345 346
              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
            } else {
              // By now, we are sure the current ikey is going to yield a
              // value
              current_entry_is_merged_ = true;
              valid_ = true;
347
              return MergeValuesNewToOld();  // Go to a different state machine
348 349 350 351 352
            }
            break;
          default:
            assert(false);
            break;
353
        }
J
jorlow@chromium.org 已提交
354
      }
355 356 357
    } else {
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);

358 359 360 361
      // This key was inserted after our snapshot was taken.
      // If this happens too many times in a row for the same user key, we want
      // to seek to the target sequence number.
      int cmp =
362
          user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey());
363
      if (cmp == 0 || (skipping && cmp <= 0)) {
364 365
        num_skipped++;
      } else {
366
        saved_key_.SetUserKey(
367
            ikey_.user_key,
368
            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
369 370
        skipping = false;
        num_skipped = 0;
371
        reseek_done = false;
372
      }
J
jorlow@chromium.org 已提交
373
    }
374 375 376

    // If we have sequentially iterated via numerous equal keys, then it's
    // better to seek so that we can avoid too many key comparisons.
377 378 379 380 381 382 383 384
    //
    // To avoid infinite loops, do not reseek if we have already attempted to
    // reseek previously.
    //
    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
    // than it does not make sense to reseek as we would actually land further
    // away from the desired key. There is opportunity for optimization here.
    if (num_skipped > max_skip_ && !reseek_done) {
385
      is_key_seqnum_zero_ = false;
386
      num_skipped = 0;
387
      reseek_done = true;
388
      std::string last_key;
389 390 391 392
      if (skipping) {
        // We're looking for the next user-key but all we see are the same
        // user-key with decreasing sequence numbers. Fast forward to
        // sequence number 0 and type deletion (the smallest type).
393 394
        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
                                                       0, kTypeDeletion));
395 396 397 398 399 400 401 402 403
        // Don't set skipping = false because we may still see more user-keys
        // equal to saved_key_.
      } else {
        // We saw multiple entries with this user key and sequence numbers
        // higher than sequence_. Fast forward to sequence_.
        // Note that this only covers a case when a higher key was overwritten
        // many times since our snapshot was taken, not the case when a lot of
        // different keys were inserted after our snapshot was taken.
        AppendInternalKey(&last_key,
404
                          ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
405 406
                                            kValueTypeForSeek));
      }
407
      iter_.Seek(last_key);
408 409
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
    } else {
410
      iter_.Next();
411
    }
412
  } while (iter_.Valid());
413

J
jorlow@chromium.org 已提交
414
  valid_ = false;
415
  return iter_.status().ok();
J
jorlow@chromium.org 已提交
416 417
}

418 419
// Merge values of the same user key starting from the current iter_ position
// Scan from the newer entries to older entries.
420
// PRE: iter_.key() points to the first merge type entry
421 422 423
//      saved_key_ stores the user key
// POST: saved_value_ has the merged value for the user key
//       iter_ points to the next entry (or invalid)
424
bool DBIter::MergeValuesNewToOld() {
425
  if (!merge_operator_) {
426
    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
427
    status_ = Status::InvalidArgument("merge_operator_ must be set.");
428
    valid_ = false;
429
    return false;
D
Deon Nicholas 已提交
430
  }
431

432 433
  // Temporarily pin the blocks that hold merge operands
  TempPinData();
434
  merge_context_.Clear();
435
  // Start the merge process by pushing the first operand
436 437
  merge_context_.PushOperand(
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
438
  TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
439 440

  ParsedInternalKey ikey;
441
  Status s;
442
  for (iter_.Next(); iter_.Valid(); iter_.Next()) {
443
    TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
444
    if (!ParseKey(&ikey)) {
445
      return false;
446 447
    }

448
    if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
449 450
      // hit the next user key, stop right here
      break;
A
Andrew Kryczka 已提交
451
    } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
452
               range_del_agg_.ShouldDelete(
453
                   ikey, RangeDelPositioningMode::kForwardTraversal)) {
454 455
      // hit a delete with the same user key, stop right here
      // iter_ is positioned after delete
456
      iter_.Next();
457
      break;
A
Andres Noetzli 已提交
458
    } else if (kTypeValue == ikey.type) {
459 460
      // hit a put, merge the put value with operands and store the
      // final result in saved_value_. We are done!
461
      const Slice val = iter_.value();
462 463
      s = MergeHelper::TimedFullMerge(
          merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(),
464
          &saved_value_, logger_, statistics_, env_, &pinned_value_, true);
465
      if (!s.ok()) {
Y
Yi Wu 已提交
466
        valid_ = false;
467
        status_ = s;
468
        return false;
469
      }
470
      // iter_ is positioned after put
471 472
      iter_.Next();
      if (!iter_.status().ok()) {
473 474 475 476
        valid_ = false;
        return false;
      }
      return true;
A
Andres Noetzli 已提交
477
    } else if (kTypeMerge == ikey.type) {
478 479
      // hit a merge, add the value as an operand and run associative merge.
      // when complete, add result to operands and continue.
480 481
      merge_context_.PushOperand(
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
482
      PERF_COUNTER_ADD(internal_merge_count, 1);
Y
Yi Wu 已提交
483 484 485 486 487 488 489 490 491 492 493
    } else if (kTypeBlobIndex == ikey.type) {
      if (!allow_blob_) {
        ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
        status_ = Status::NotSupported(
            "Encounter unexpected blob index. Please open DB with "
            "rocksdb::blob_db::BlobDB instead.");
      } else {
        status_ =
            Status::NotSupported("Blob DB does not support merge operator.");
      }
      valid_ = false;
494
      return false;
A
Andres Noetzli 已提交
495 496
    } else {
      assert(false);
497 498 499
    }
  }

500
  if (!iter_.status().ok()) {
501 502 503 504
    valid_ = false;
    return false;
  }

505 506 507 508
  // we either exhausted all internal keys under this user key, or hit
  // a deletion marker.
  // feed null as the existing value to the merge operator, such that
  // client can differentiate this scenario and do things accordingly.
509 510 511
  s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(),
                                  nullptr, merge_context_.GetOperands(),
                                  &saved_value_, logger_, statistics_, env_,
512
                                  &pinned_value_, true);
513
  if (!s.ok()) {
Y
Yi Wu 已提交
514
    valid_ = false;
515
    status_ = s;
516
    return false;
517
  }
518 519 520

  assert(status_.ok());
  return true;
521 522
}

J
jorlow@chromium.org 已提交
523 524
void DBIter::Prev() {
  assert(valid_);
525
  assert(status_.ok());
526 527

  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_);
528
  ReleaseTempPinnedData();
529
  ResetInternalKeysSkippedCounter();
530
  bool ok = true;
S
Stanislau Hlebik 已提交
531
  if (direction_ == kForward) {
532 533 534 535 536 537
    if (!ReverseToBackward()) {
      ok = false;
    }
  }
  if (ok) {
    PrevInternal();
S
Stanislau Hlebik 已提交
538
  }
M
Manuel Ung 已提交
539
  if (statistics_ != nullptr) {
540
    local_stats_.prev_count_++;
M
Manuel Ung 已提交
541
    if (valid_) {
542 543
      local_stats_.prev_found_count_++;
      local_stats_.bytes_read_ += (key().size() + value().size());
M
Manuel Ung 已提交
544 545
    }
  }
S
Stanislau Hlebik 已提交
546
}
J
jorlow@chromium.org 已提交
547

548
bool DBIter::ReverseToForward() {
549
  assert(iter_.status().ok());
550 551 552 553

  // When moving backwards, iter_ is positioned on _previous_ key, which may
  // not exist or may have different prefix than the current key().
  // If that's the case, seek iter_ to current key.
554
  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
555 556
    IterKey last_key;
    last_key.SetInternalKey(ParsedInternalKey(
557
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
558
    iter_.Seek(last_key.GetInternalKey());
559
  }
560

561
  direction_ = kForward;
562
  // Skip keys less than the current key() (a.k.a. saved_key_).
563
  while (iter_.Valid()) {
564 565 566 567
    ParsedInternalKey ikey;
    if (!ParseKey(&ikey)) {
      return false;
    }
568
    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
569 570
      return true;
    }
571
    iter_.Next();
572 573
  }

574
  if (!iter_.status().ok()) {
575 576
    valid_ = false;
    return false;
577
  }
578 579

  return true;
580 581
}

582 583
// Move iter_ to the key before saved_key_.
bool DBIter::ReverseToBackward() {
584
  assert(iter_.status().ok());
585 586 587 588 589 590

  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
  // key, which may not exist or may have prefix different from current.
  // If that's the case, seek to saved_key_.
  if (current_entry_is_merged_ &&
      ((prefix_extractor_ != nullptr && !total_order_seek_) ||
591
       !iter_.Valid())) {
592
    IterKey last_key;
593 594 595 596 597 598
    // Using kMaxSequenceNumber and kValueTypeForSeek
    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
    // than saved_key_.
    last_key.SetInternalKey(ParsedInternalKey(
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
    if (prefix_extractor_ != nullptr && !total_order_seek_) {
599
      iter_.SeekForPrev(last_key.GetInternalKey());
600 601 602 603 604
    } else {
      // Some iterators may not support SeekForPrev(), so we avoid using it
      // when prefix seek mode is disabled. This is somewhat expensive
      // (an extra Prev(), as well as an extra change of direction of iter_),
      // so we may need to reconsider it later.
605 606 607
      iter_.Seek(last_key.GetInternalKey());
      if (!iter_.Valid() && iter_.status().ok()) {
        iter_.SeekToLast();
608
      }
609 610 611 612
    }
  }

  direction_ = kReverse;
613
  return FindUserKeyBeforeSavedKey();
614 615
}

S
Stanislau Hlebik 已提交
616
void DBIter::PrevInternal() {
617
  while (iter_.Valid()) {
618
    saved_key_.SetUserKey(
619 620
        ExtractUserKey(iter_.key()),
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
621

622 623 624 625 626 627 628 629
    if (prefix_extractor_ && prefix_same_as_start_ &&
        prefix_extractor_->Transform(saved_key_.GetUserKey())
                .compare(prefix_start_key_) != 0) {
      // Current key does not have the same prefix as start
      valid_ = false;
      return;
    }

630 631 632 633
    assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
           user_comparator_.Compare(saved_key_.GetUserKey(),
                                    *iterate_lower_bound_) >= 0);
    if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
634 635
        user_comparator_.Compare(saved_key_.GetUserKey(),
                                 *iterate_lower_bound_) < 0) {
636 637 638 639 640
      // We've iterated earlier than the user-specified lower bound.
      valid_ = false;
      return;
    }

641
    if (!FindValueForCurrentKey()) {  // assigns valid_
S
Stanislau Hlebik 已提交
642
      return;
J
jorlow@chromium.org 已提交
643
    }
644

645 646 647
    // Whether or not we found a value for current key, we need iter_ to end up
    // on a smaller key.
    if (!FindUserKeyBeforeSavedKey()) {
648 649 650
      return;
    }

651 652 653
    if (valid_) {
      // Found the value.
      return;
S
Stanislau Hlebik 已提交
654
    }
655 656 657

    if (TooManyInternalKeysSkipped(false)) {
      return;
S
Stanislau Hlebik 已提交
658 659
    }
  }
660

S
Stanislau Hlebik 已提交
661 662
  // We haven't found any key - iterator is not valid
  valid_ = false;
J
jorlow@chromium.org 已提交
663 664
}

665 666 667 668 669 670 671 672 673 674 675
// Used for backwards iteration.
// Looks at the entries with user key saved_key_ and finds the most up-to-date
// value for it, or executes a merge, or determines that the value was deleted.
// Sets valid_ to true if the value is found and is ready to be presented to
// the user through value().
// Sets valid_ to false if the value was deleted, and we should try another key.
// Returns false if an error occurred, and !status().ok() and !valid_.
//
// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
//       the entry just before them, or on the entry just after them.
S
Stanislau Hlebik 已提交
676
bool DBIter::FindValueForCurrentKey() {
677
  assert(iter_.Valid());
678
  merge_context_.Clear();
679
  current_entry_is_merged_ = false;
A
Andres Noetzli 已提交
680 681
  // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
  // kTypeValue)
S
Stanislau Hlebik 已提交
682 683
  ValueType last_not_merge_type = kTypeDeletion;
  ValueType last_key_entry_type = kTypeDeletion;
J
jorlow@chromium.org 已提交
684

685 686 687
  // Temporarily pin blocks that hold (merge operands / the value)
  ReleaseTempPinnedData();
  TempPinData();
S
Stanislau Hlebik 已提交
688
  size_t num_skipped = 0;
689
  while (iter_.Valid()) {
690 691 692 693 694 695
    ParsedInternalKey ikey;
    if (!ParseKey(&ikey)) {
      return false;
    }

    if (!IsVisible(ikey.sequence) ||
696
        !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
697 698
      break;
    }
699 700 701 702
    if (TooManyInternalKeysSkipped()) {
      return false;
    }

703 704 705
    // This user key has lots of entries.
    // We're going from old to new, and it's taking too long. Let's do a Seek()
    // and go from new to old. This helps when a key was overwritten many times.
706
    if (num_skipped >= max_skip_) {
S
Stanislau Hlebik 已提交
707 708 709 710 711 712
      return FindValueForCurrentKeyUsingSeek();
    }

    last_key_entry_type = ikey.type;
    switch (last_key_entry_type) {
      case kTypeValue:
Y
Yi Wu 已提交
713
      case kTypeBlobIndex:
714
        if (range_del_agg_.ShouldDelete(
715
                ikey, RangeDelPositioningMode::kBackwardTraversal)) {
A
Andrew Kryczka 已提交
716 717 718
          last_key_entry_type = kTypeRangeDeletion;
          PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        } else {
719 720
          assert(iter_.iter()->IsValuePinned());
          pinned_value_ = iter_.value();
A
Andrew Kryczka 已提交
721
        }
722
        merge_context_.Clear();
A
Andrew Kryczka 已提交
723
        last_not_merge_type = last_key_entry_type;
S
Stanislau Hlebik 已提交
724 725
        break;
      case kTypeDeletion:
A
Andres Noetzli 已提交
726
      case kTypeSingleDeletion:
727
        merge_context_.Clear();
A
Andres Noetzli 已提交
728
        last_not_merge_type = last_key_entry_type;
729
        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
S
Stanislau Hlebik 已提交
730 731
        break;
      case kTypeMerge:
732
        if (range_del_agg_.ShouldDelete(
733
                ikey, RangeDelPositioningMode::kBackwardTraversal)) {
A
Andrew Kryczka 已提交
734 735 736 737 738 739 740
          merge_context_.Clear();
          last_key_entry_type = kTypeRangeDeletion;
          last_not_merge_type = last_key_entry_type;
          PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        } else {
          assert(merge_operator_ != nullptr);
          merge_context_.PushOperandBack(
741 742
              iter_.value(),
              iter_.iter()->IsValuePinned() /* operand_pinned */);
743
          PERF_COUNTER_ADD(internal_merge_count, 1);
A
Andrew Kryczka 已提交
744
        }
S
Stanislau Hlebik 已提交
745 746 747 748 749
        break;
      default:
        assert(false);
    }

750
    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
751
    iter_.Prev();
S
Stanislau Hlebik 已提交
752
    ++num_skipped;
753 754
  }

755
  if (!iter_.status().ok()) {
756 757
    valid_ = false;
    return false;
S
Stanislau Hlebik 已提交
758 759
  }

760
  Status s;
Y
Yi Wu 已提交
761
  is_blob_ = false;
S
Stanislau Hlebik 已提交
762 763
  switch (last_key_entry_type) {
    case kTypeDeletion:
A
Andres Noetzli 已提交
764
    case kTypeSingleDeletion:
A
Andrew Kryczka 已提交
765
    case kTypeRangeDeletion:
S
Stanislau Hlebik 已提交
766
      valid_ = false;
767
      return true;
S
Stanislau Hlebik 已提交
768
    case kTypeMerge:
769
      current_entry_is_merged_ = true;
A
Aaron Gao 已提交
770
      if (last_not_merge_type == kTypeDeletion ||
A
Andrew Kryczka 已提交
771 772
          last_not_merge_type == kTypeSingleDeletion ||
          last_not_merge_type == kTypeRangeDeletion) {
773 774 775
        s = MergeHelper::TimedFullMerge(
            merge_operator_, saved_key_.GetUserKey(), nullptr,
            merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
776
            env_, &pinned_value_, true);
Y
Yi Wu 已提交
777 778 779 780 781 782 783 784 785 786 787
      } else if (last_not_merge_type == kTypeBlobIndex) {
        if (!allow_blob_) {
          ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
          status_ = Status::NotSupported(
              "Encounter unexpected blob index. Please open DB with "
              "rocksdb::blob_db::BlobDB instead.");
        } else {
          status_ =
              Status::NotSupported("Blob DB does not support merge operator.");
        }
        valid_ = false;
788
        return false;
789
      } else {
S
Stanislau Hlebik 已提交
790
        assert(last_not_merge_type == kTypeValue);
791
        s = MergeHelper::TimedFullMerge(
792
            merge_operator_, saved_key_.GetUserKey(), &pinned_value_,
793
            merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
794
            env_, &pinned_value_, true);
795
      }
S
Stanislau Hlebik 已提交
796 797
      break;
    case kTypeValue:
798
      // do nothing - we've already has value in pinned_value_
S
Stanislau Hlebik 已提交
799
      break;
Y
Yi Wu 已提交
800 801 802 803 804 805 806
    case kTypeBlobIndex:
      if (!allow_blob_) {
        ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
        status_ = Status::NotSupported(
            "Encounter unexpected blob index. Please open DB with "
            "rocksdb::blob_db::BlobDB instead.");
        valid_ = false;
807
        return false;
Y
Yi Wu 已提交
808 809 810
      }
      is_blob_ = true;
      break;
S
Stanislau Hlebik 已提交
811 812 813
    default:
      assert(false);
      break;
J
jorlow@chromium.org 已提交
814
  }
815
  if (!s.ok()) {
Y
Yi Wu 已提交
816
    valid_ = false;
817
    status_ = s;
818
    return false;
819
  }
820
  valid_ = true;
S
Stanislau Hlebik 已提交
821 822
  return true;
}
J
jorlow@chromium.org 已提交
823

S
Stanislau Hlebik 已提交
824 825
// This function is used in FindValueForCurrentKey.
// We use Seek() function instead of Prev() to find necessary value
826 827
// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
//       Would be nice to reuse some code.
S
Stanislau Hlebik 已提交
828
bool DBIter::FindValueForCurrentKeyUsingSeek() {
829 830 831
  // FindValueForCurrentKey will enable pinning before calling
  // FindValueForCurrentKeyUsingSeek()
  assert(pinned_iters_mgr_.PinningEnabled());
S
Stanislau Hlebik 已提交
832
  std::string last_key;
833 834
  AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
                                                 sequence_, kValueTypeForSeek));
835
  iter_.Seek(last_key);
S
Stanislau Hlebik 已提交
836 837
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);

838 839
  // In case read_callback presents, the value we seek to may not be visible.
  // Find the next value that's visible.
S
Stanislau Hlebik 已提交
840
  ParsedInternalKey ikey;
841
  while (true) {
842
    if (!iter_.Valid()) {
843
      valid_ = false;
844
      return iter_.status().ok();
845 846 847 848 849
    }

    if (!ParseKey(&ikey)) {
      return false;
    }
850
    if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
851 852 853 854 855 856 857 858 859 860
      // No visible values for this key, even though FindValueForCurrentKey()
      // has seen some. This is possible if we're using a tailing iterator, and
      // the entries were discarded in a compaction.
      valid_ = false;
      return true;
    }

    if (IsVisible(ikey.sequence)) {
      break;
    }
861

862
    iter_.Next();
863
  }
S
Stanislau Hlebik 已提交
864

A
Andrew Kryczka 已提交
865
  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
866
      range_del_agg_.ShouldDelete(
867
          ikey, RangeDelPositioningMode::kBackwardTraversal)) {
J
jorlow@chromium.org 已提交
868
    valid_ = false;
869
    return true;
S
Stanislau Hlebik 已提交
870
  }
Y
Yi Wu 已提交
871 872 873 874 875 876
  if (ikey.type == kTypeBlobIndex && !allow_blob_) {
    ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
    status_ = Status::NotSupported(
        "Encounter unexpected blob index. Please open DB with "
        "rocksdb::blob_db::BlobDB instead.");
    valid_ = false;
877
    return false;
Y
Yi Wu 已提交
878 879
  }
  if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) {
880 881
    assert(iter_.iter()->IsValuePinned());
    pinned_value_ = iter_.value();
A
Andrew Kryczka 已提交
882 883 884
    valid_ = true;
    return true;
  }
S
Stanislau Hlebik 已提交
885 886 887

  // kTypeMerge. We need to collect all kTypeMerge values and save them
  // in operands
888
  assert(ikey.type == kTypeMerge);
889
  current_entry_is_merged_ = true;
890
  merge_context_.Clear();
891 892
  merge_context_.PushOperand(
      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
893
  while (true) {
894
    iter_.Next();
S
Stanislau Hlebik 已提交
895

896 897
    if (!iter_.Valid()) {
      if (!iter_.status().ok()) {
898 899 900 901
        valid_ = false;
        return false;
      }
      break;
S
Stanislau Hlebik 已提交
902
    }
903 904 905
    if (!ParseKey(&ikey)) {
      return false;
    }
906
    if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) {
907 908 909 910 911
      break;
    }

    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
        range_del_agg_.ShouldDelete(
912
            ikey, RangeDelPositioningMode::kForwardTraversal)) {
913 914
      break;
    } else if (ikey.type == kTypeValue) {
915
      const Slice val = iter_.value();
916 917 918 919 920 921 922 923 924
      Status s = MergeHelper::TimedFullMerge(
          merge_operator_, saved_key_.GetUserKey(), &val,
          merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
          env_, &pinned_value_, true);
      if (!s.ok()) {
        valid_ = false;
        status_ = s;
        return false;
      }
Y
Yi Wu 已提交
925
      valid_ = true;
926 927
      return true;
    } else if (ikey.type == kTypeMerge) {
928 929
      merge_context_.PushOperand(
          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
930 931 932 933 934 935 936 937 938 939 940
      PERF_COUNTER_ADD(internal_merge_count, 1);
    } else if (ikey.type == kTypeBlobIndex) {
      if (!allow_blob_) {
        ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
        status_ = Status::NotSupported(
            "Encounter unexpected blob index. Please open DB with "
            "rocksdb::blob_db::BlobDB instead.");
      } else {
        status_ =
            Status::NotSupported("Blob DB does not support merge operator.");
      }
Y
Yi Wu 已提交
941
      valid_ = false;
942 943 944
      return false;
    } else {
      assert(false);
945
    }
S
Stanislau Hlebik 已提交
946 947
  }

948 949 950 951 952
  Status s = MergeHelper::TimedFullMerge(
      merge_operator_, saved_key_.GetUserKey(), nullptr,
      merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_,
      &pinned_value_, true);
  if (!s.ok()) {
Y
Yi Wu 已提交
953
    valid_ = false;
954
    status_ = s;
955
    return false;
956
  }
S
Stanislau Hlebik 已提交
957

958 959 960
  // Make sure we leave iter_ in a good state. If it's valid and we don't care
  // about prefixes, that's already good enough. Otherwise it needs to be
  // seeked to the current key.
961
  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
962
    if (prefix_extractor_ != nullptr && !total_order_seek_) {
963
      iter_.SeekForPrev(last_key);
964
    } else {
965 966 967
      iter_.Seek(last_key);
      if (!iter_.Valid() && iter_.status().ok()) {
        iter_.SeekToLast();
968 969 970
      }
    }
    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
S
Stanislau Hlebik 已提交
971
  }
972 973 974

  valid_ = true;
  return true;
S
Stanislau Hlebik 已提交
975 976
}

977 978 979 980
// Move backwards until the key smaller than saved_key_.
// Changes valid_ only if return value is false.
bool DBIter::FindUserKeyBeforeSavedKey() {
  assert(status_.ok());
S
Stanislau Hlebik 已提交
981
  size_t num_skipped = 0;
982
  while (iter_.Valid()) {
983 984 985
    ParsedInternalKey ikey;
    if (!ParseKey(&ikey)) {
      return false;
986 987
    }

988
    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) {
989 990 991 992 993
      return true;
    }

    if (TooManyInternalKeysSkipped()) {
      return false;
S
Stanislau Hlebik 已提交
994
    }
995

S
Siying Dong 已提交
996
    assert(ikey.sequence != kMaxSequenceNumber);
Y
Yi Wu 已提交
997
    if (!IsVisible(ikey.sequence)) {
998 999 1000 1001
      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
    } else {
      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
    }
1002

1003
    if (num_skipped >= max_skip_) {
1004 1005 1006 1007 1008 1009
      num_skipped = 0;
      IterKey last_key;
      last_key.SetInternalKey(ParsedInternalKey(
          saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
      // It would be more efficient to use SeekForPrev() here, but some
      // iterators may not support it.
1010
      iter_.Seek(last_key.GetInternalKey());
1011
      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
1012
      if (!iter_.Valid()) {
1013 1014 1015 1016 1017 1018
        break;
      }
    } else {
      ++num_skipped;
    }

1019
    iter_.Prev();
S
Stanislau Hlebik 已提交
1020
  }
1021

1022
  if (!iter_.status().ok()) {
1023 1024 1025 1026 1027
    valid_ = false;
    return false;
  }

  return true;
S
Stanislau Hlebik 已提交
1028 1029
}

1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
bool DBIter::TooManyInternalKeysSkipped(bool increment) {
  if ((max_skippable_internal_keys_ > 0) &&
      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
    valid_ = false;
    status_ = Status::Incomplete("Too many internal keys skipped.");
    return true;
  } else if (increment) {
    num_internal_keys_skipped_++;
  }
  return false;
}

Y
Yi Wu 已提交
1042
bool DBIter::IsVisible(SequenceNumber sequence) {
1043
  if (read_callback_ == nullptr) {
1044 1045 1046
    return sequence <= sequence_;
  } else {
    return read_callback_->IsVisible(sequence);
1047
  }
1048
}
1049

J
jorlow@chromium.org 已提交
1050
void DBIter::Seek(const Slice& target) {
1051
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
L
Lei Jin 已提交
1052
  StopWatch sw(env_, statistics_, DB_SEEK);
1053
  status_ = Status::OK();
1054
  ReleaseTempPinnedData();
1055
  ResetInternalKeysSkippedCounter();
1056
  is_key_seqnum_zero_ = false;
1057

1058
  SequenceNumber seq = sequence_;
1059
  saved_key_.Clear();
1060
  saved_key_.SetInternalKey(target, seq);
1061

1062 1063 1064 1065 1066 1067
#ifndef ROCKSDB_LITE
  if (db_impl_ != nullptr && cfd_ != nullptr) {
    db_impl_->TraceIteratorSeek(cfd_->GetID(), target);
  }
#endif  // ROCKSDB_LITE

Z
zhangjinpeng1987 已提交
1068
  if (iterate_lower_bound_ != nullptr &&
1069 1070
      user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) <
          0) {
Z
zhangjinpeng1987 已提交
1071
    saved_key_.Clear();
1072
    saved_key_.SetInternalKey(*iterate_lower_bound_, seq);
Z
zhangjinpeng1987 已提交
1073 1074
  }

1075 1076
  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1077
    iter_.Seek(saved_key_.GetInternalKey());
1078
    range_del_agg_.InvalidateRangeDelMapPositions();
1079
  }
M
Manuel Ung 已提交
1080
  RecordTick(statistics_, NUMBER_DB_SEEK);
1081
  if (iter_.Valid()) {
1082 1083 1084
    if (prefix_extractor_ && prefix_same_as_start_) {
      prefix_start_key_ = prefix_extractor_->Transform(target);
    }
1085 1086
    direction_ = kForward;
    ClearSavedValue();
1087 1088 1089 1090
    FindNextUserEntry(false /* not skipping */, prefix_same_as_start_);
    if (!valid_) {
      prefix_start_key_.clear();
    }
M
Manuel Ung 已提交
1091 1092
    if (statistics_ != nullptr) {
      if (valid_) {
1093
        // Decrement since we don't want to count this key as skipped
M
Manuel Ung 已提交
1094 1095
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1096
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
M
Manuel Ung 已提交
1097 1098
      }
    }
J
jorlow@chromium.org 已提交
1099 1100 1101
  } else {
    valid_ = false;
  }
1102

1103
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1104 1105
    prefix_start_buf_.SetUserKey(prefix_start_key_);
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1106
  }
J
jorlow@chromium.org 已提交
1107
}
J
jorlow@chromium.org 已提交
1108

A
Aaron Gao 已提交
1109
void DBIter::SeekForPrev(const Slice& target) {
1110
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
A
Aaron Gao 已提交
1111
  StopWatch sw(env_, statistics_, DB_SEEK);
1112
  status_ = Status::OK();
A
Aaron Gao 已提交
1113
  ReleaseTempPinnedData();
1114
  ResetInternalKeysSkippedCounter();
1115
  is_key_seqnum_zero_ = false;
A
Aaron Gao 已提交
1116 1117 1118 1119 1120
  saved_key_.Clear();
  // now saved_key is used to store internal key.
  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
                            kValueTypeForSeekForPrev);

Z
zhangjinpeng1987 已提交
1121
  if (iterate_upper_bound_ != nullptr &&
1122 1123
      user_comparator_.Compare(saved_key_.GetUserKey(),
                               *iterate_upper_bound_) >= 0) {
Z
zhangjinpeng1987 已提交
1124 1125 1126 1127
    saved_key_.Clear();
    saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber);
  }

A
Aaron Gao 已提交
1128 1129
  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1130
    iter_.SeekForPrev(saved_key_.GetInternalKey());
1131
    range_del_agg_.InvalidateRangeDelMapPositions();
A
Aaron Gao 已提交
1132 1133
  }

1134 1135 1136 1137 1138 1139
#ifndef ROCKSDB_LITE
  if (db_impl_ != nullptr && cfd_ != nullptr) {
    db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target);
  }
#endif  // ROCKSDB_LITE

A
Aaron Gao 已提交
1140
  RecordTick(statistics_, NUMBER_DB_SEEK);
1141
  if (iter_.Valid()) {
A
Aaron Gao 已提交
1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154
    if (prefix_extractor_ && prefix_same_as_start_) {
      prefix_start_key_ = prefix_extractor_->Transform(target);
    }
    direction_ = kReverse;
    ClearSavedValue();
    PrevInternal();
    if (!valid_) {
      prefix_start_key_.clear();
    }
    if (statistics_ != nullptr) {
      if (valid_) {
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1155
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
A
Aaron Gao 已提交
1156 1157 1158 1159 1160 1161
      }
    }
  } else {
    valid_ = false;
  }
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1162 1163
    prefix_start_buf_.SetUserKey(prefix_start_key_);
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
A
Aaron Gao 已提交
1164 1165 1166
  }
}

J
jorlow@chromium.org 已提交
1167
void DBIter::SeekToFirst() {
1168 1169 1170 1171
  if (iterate_lower_bound_ != nullptr) {
    Seek(*iterate_lower_bound_);
    return;
  }
1172
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
1173 1174 1175 1176 1177 1178
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek will be used.
  if (prefix_extractor_ != nullptr && !total_order_seek_) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  status_ = Status::OK();
J
jorlow@chromium.org 已提交
1179
  direction_ = kForward;
1180
  ReleaseTempPinnedData();
1181
  ResetInternalKeysSkippedCounter();
J
jorlow@chromium.org 已提交
1182
  ClearSavedValue();
1183
  is_key_seqnum_zero_ = false;
1184 1185 1186

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1187
    iter_.SeekToFirst();
1188
    range_del_agg_.InvalidateRangeDelMapPositions();
1189 1190
  }

M
Manuel Ung 已提交
1191
  RecordTick(statistics_, NUMBER_DB_SEEK);
1192
  if (iter_.Valid()) {
1193
    saved_key_.SetUserKey(
1194 1195
        ExtractUserKey(iter_.key()),
        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
1196
    FindNextUserEntry(false /* not skipping */, false /* no prefix check */);
M
Manuel Ung 已提交
1197 1198 1199 1200
    if (statistics_ != nullptr) {
      if (valid_) {
        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1201
        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
M
Manuel Ung 已提交
1202 1203
      }
    }
J
jorlow@chromium.org 已提交
1204 1205
  } else {
    valid_ = false;
J
jorlow@chromium.org 已提交
1206
  }
1207
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1208 1209 1210
    prefix_start_buf_.SetUserKey(
        prefix_extractor_->Transform(saved_key_.GetUserKey()));
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1211
  }
J
jorlow@chromium.org 已提交
1212 1213
}

J
jorlow@chromium.org 已提交
1214
void DBIter::SeekToLast() {
1215 1216 1217
  if (iterate_upper_bound_ != nullptr) {
    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
    SeekForPrev(*iterate_upper_bound_);
1218
    if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) {
1219 1220 1221 1222 1223 1224
      ReleaseTempPinnedData();
      PrevInternal();
    }
    return;
  }

1225
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
S
Stanislau Hlebik 已提交
1226
  // Don't use iter_::Seek() if we set a prefix extractor
1227
  // because prefix seek will be used.
1228
  if (prefix_extractor_ != nullptr && !total_order_seek_) {
S
Stanislau Hlebik 已提交
1229 1230
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
1231
  status_ = Status::OK();
J
jorlow@chromium.org 已提交
1232
  direction_ = kReverse;
1233
  ReleaseTempPinnedData();
1234
  ResetInternalKeysSkippedCounter();
J
jorlow@chromium.org 已提交
1235
  ClearSavedValue();
1236
  is_key_seqnum_zero_ = false;
1237 1238 1239

  {
    PERF_TIMER_GUARD(seek_internal_seek_time);
1240
    iter_.SeekToLast();
1241
    range_del_agg_.InvalidateRangeDelMapPositions();
1242
  }
1243
  PrevInternal();
M
Manuel Ung 已提交
1244 1245 1246 1247 1248
  if (statistics_ != nullptr) {
    RecordTick(statistics_, NUMBER_DB_SEEK);
    if (valid_) {
      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
1249
      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
M
Manuel Ung 已提交
1250 1251
    }
  }
1252
  if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
1253 1254 1255
    prefix_start_buf_.SetUserKey(
        prefix_extractor_->Transform(saved_key_.GetUserKey()));
    prefix_start_key_ = prefix_start_buf_.GetUserKey();
1256
  }
J
jorlow@chromium.org 已提交
1257 1258
}

1259 1260
Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
                        const ImmutableCFOptions& cf_options,
1261
                        const MutableCFOptions& mutable_cf_options,
1262 1263 1264
                        const Comparator* user_key_comparator,
                        InternalIterator* internal_iter,
                        const SequenceNumber& sequence,
Y
Yi Wu 已提交
1265
                        uint64_t max_sequential_skip_in_iterations,
1266 1267 1268 1269 1270 1271
                        ReadCallback* read_callback, DBImpl* db_impl,
                        ColumnFamilyData* cfd, bool allow_blob) {
  DBIter* db_iter = new DBIter(
      env, read_options, cf_options, mutable_cf_options, user_key_comparator,
      internal_iter, sequence, false, max_sequential_skip_in_iterations,
      read_callback, db_impl, cfd, allow_blob);
1272
  return db_iter;
1273 1274
}

1275
}  // namespace rocksdb