//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/compaction_iterator.h"
#include "table/internal_iterator.h"

namespace rocksdb {

CompactionIterator::CompactionIterator(
    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
    SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
    SequenceNumber earliest_write_conflict_snapshot, Env* env,
    bool expect_valid_internal_key, RangeDelAggregator* range_del_agg,
    const Compaction* compaction, const CompactionFilter* compaction_filter,
    LogBuffer* log_buffer)
    : input_(input),
      cmp_(cmp),
      merge_helper_(merge_helper),
      snapshots_(snapshots),
      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
      env_(env),
      expect_valid_internal_key_(expect_valid_internal_key),
      range_del_agg_(range_del_agg),
      compaction_(compaction),
      compaction_filter_(compaction_filter),
      log_buffer_(log_buffer),
      merge_out_iter_(merge_helper_) {
  assert(compaction_filter_ == nullptr || compaction_ != nullptr);
  bottommost_level_ =
      compaction_ == nullptr ? false : compaction_->bottommost_level();
  if (compaction_ != nullptr) {
    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
  }

  if (snapshots_->size() == 0) {
    // optimize for fast path if there are no snapshots
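    // With no live snapshots, every key is visible at the "tip" of the DB,
    // so per-key snapshot bookkeeping can be skipped entirely.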
    visible_at_tip_ = true;
    earliest_snapshot_ = last_sequence;
    latest_snapshot_ = 0;
  } else {
    visible_at_tip_ = false;
    earliest_snapshot_ = snapshots_->at(0);
    latest_snapshot_ = snapshots_->back();
  }
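  // Filters that return true from IgnoreSnapshots() are invoked even for
  // keys that are still visible to a live snapshot (see the latest_snapshot_
  // check in NextFromInput()).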
  if (compaction_filter_ != nullptr && compaction_filter_->IgnoreSnapshots()) {
    ignore_snapshots_ = true;
  } else {
    ignore_snapshots_ = false;
  }
  input_->SetPinnedItersMgr(&pinned_iters_mgr_);
}

CompactionIterator::~CompactionIterator() {
  // input_ iterator lifetime is longer than pinned_iters_mgr_ lifetime
  input_->SetPinnedItersMgr(nullptr);
}

void CompactionIterator::ResetRecordCounts() {
  iter_stats_.num_record_drop_user = 0;
  iter_stats_.num_record_drop_hidden = 0;
  iter_stats_.num_record_drop_obsolete = 0;
  iter_stats_.num_record_drop_range_del = 0;
  iter_stats_.num_range_del_drop_obsolete = 0;
}

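// Typical usage (roughly how the compaction code drives this class;
// Valid()/key()/value() are the accessors declared in the header):
//
//   c_iter.SeekToFirst();
//   while (c_iter.Valid()) {
//     // consume c_iter.key() / c_iter.value(), then advance
//     c_iter.Next();
//   }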
void CompactionIterator::SeekToFirst() {
  NextFromInput();
  PrepareOutput();
}

void CompactionIterator::Next() {
  // If there is a merge output, return it before continuing to process the
  // input.
  if (merge_out_iter_.Valid()) {
    merge_out_iter_.Next();

    // Check if we returned all records of the merge output.
    if (merge_out_iter_.Valid()) {
      key_ = merge_out_iter_.key();
      value_ = merge_out_iter_.value();
      bool valid_key __attribute__((__unused__)) =
          ParseInternalKey(key_, &ikey_);
      // MergeUntil stops when it encounters a corrupt key and does not
      // include it in the result, so we expect the keys here to be valid.
      assert(valid_key);
      // Keep current_key_ in sync.
      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
      key_ = current_key_.GetKey();
      ikey_.user_key = current_key_.GetUserKey();
      valid_ = true;
    } else {
      // We consumed all pinned merge operands, release pinned iterators
      pinned_iters_mgr_.ReleasePinnedData();
      // MergeHelper moves the iterator to the first record after the merged
      // records, so even though we reached the end of the merge output, we do
      // not want to advance the iterator.
      NextFromInput();
    }
  } else {
    // Only advance the input iterator if there is no merge output and the
    // iterator is not already at the next record.
    if (!at_next_) {
      input_->Next();
    }
    NextFromInput();
  }

  if (valid_) {
    // Record that we've outputted a record for the current key.
    has_outputted_key_ = true;
  }

  PrepareOutput();
}

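// Advances the input until it either finds an entry that should be emitted
// (valid_ == true, with key_/value_/ikey_ set) or exhausts the input.  Along
// the way it applies the compaction filter, drops entries hidden by newer
// entries for the same user key within a snapshot stripe, pairs SingleDeletes
// with the values they delete, drops deletion markers whose key cannot exist
// beyond the output level, and folds merge operands together via MergeHelper.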
void CompactionIterator::NextFromInput() {
  at_next_ = false;
  valid_ = false;

  while (!valid_ && input_->Valid()) {
    key_ = input_->key();
    value_ = input_->value();
    iter_stats_.num_input_records++;

    if (!ParseInternalKey(key_, &ikey_)) {
      // If `expect_valid_internal_key_` is false, return the corrupted key
      // and let the caller decide what to do with it.
      // TODO(noetzli): We should have a more elegant solution for this.
      if (expect_valid_internal_key_) {
        assert(!"Corrupted internal key not expected.");
        status_ = Status::Corruption("Corrupted internal key not expected.");
        break;
      }
      key_ = current_key_.SetKey(key_);
      has_current_user_key_ = false;
      current_user_key_sequence_ = kMaxSequenceNumber;
      current_user_key_snapshot_ = 0;
      iter_stats_.num_input_corrupt_records++;
      valid_ = true;
      break;
    }

    // Update input statistics
    if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
      iter_stats_.num_input_deletion_records++;
    }
    iter_stats_.total_input_raw_key_bytes += key_.size();
    iter_stats_.total_input_raw_value_bytes += value_.size();

    // Check whether the user key changed. After this if statement current_key_
    // is a copy of the current input key (maybe converted to a delete by the
    // compaction filter). ikey_.user_key is pointing to the copy.
    if (!has_current_user_key_ ||
        !cmp_->Equal(ikey_.user_key, current_user_key_)) {
      // First occurrence of this user key
      // Copy key for output
      key_ = current_key_.SetKey(key_, &ikey_);
      current_user_key_ = ikey_.user_key;
      has_current_user_key_ = true;
      has_outputted_key_ = false;
      current_user_key_sequence_ = kMaxSequenceNumber;
      current_user_key_snapshot_ = 0;

      // apply the compaction filter to the first occurrence of the user key
      if (compaction_filter_ != nullptr && ikey_.type == kTypeValue &&
          (visible_at_tip_ || ikey_.sequence > latest_snapshot_ ||
           ignore_snapshots_)) {
        // If the user has specified a compaction filter and the sequence
        // number is greater than any external snapshot, then invoke the
        // filter. If the return value of the compaction filter is true,
        // replace the entry with a deletion marker.
        bool value_changed = false;
        bool to_delete = false;
        compaction_filter_value_.clear();
        {
          StopWatchNano timer(env_, true);
          to_delete = compaction_filter_->Filter(
              compaction_->level(), ikey_.user_key, value_,
              &compaction_filter_value_, &value_changed);
          iter_stats_.total_filter_time +=
              env_ != nullptr ? timer.ElapsedNanos() : 0;
        }
        if (to_delete) {
          // convert the current key to a delete
          ikey_.type = kTypeDeletion;
          current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
          // no value associated with delete
          value_.clear();
          iter_stats_.num_record_drop_user++;
        } else if (value_changed) {
          value_ = compaction_filter_value_;
        }
      }
    } else {
      // Update the current key to reflect the new sequence number/type without
      // copying the user key.
      // TODO(rven): Compaction filter does not process keys in this path
      // Need to have the compaction filter process multiple versions
      // if we have versions on both sides of a snapshot
      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
      key_ = current_key_.GetKey();
      ikey_.user_key = current_key_.GetUserKey();
    }

    // If there are no snapshots, then this kv affects visibility at tip.
    // Otherwise, search through all existing snapshots to find the earliest
    // snapshot that is affected by this kv.
    SequenceNumber last_sequence __attribute__((__unused__)) =
        current_user_key_sequence_;
    current_user_key_sequence_ = ikey_.sequence;
    SequenceNumber last_snapshot = current_user_key_snapshot_;
    SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
    current_user_key_snapshot_ =
        visible_at_tip_
            ? earliest_snapshot_
            : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);

    if (clear_and_output_next_key_) {
      // In the previous iteration we encountered a single delete that we could
      // not compact out.  We will keep this Put, but can drop its data.
      // (See Optimization 3, below.)
      assert(ikey_.type == kTypeValue);
      assert(current_user_key_snapshot_ == last_snapshot);

      value_.clear();
      valid_ = true;
      clear_and_output_next_key_ = false;
    } else if (ikey_.type == kTypeSingleDeletion) {
      // We can compact out a SingleDelete if:
      // 1) We encounter the corresponding PUT -OR- we know that this key
      //    doesn't appear past this output level
      // =AND=
      // 2) We've already returned a record in this snapshot -OR-
      //    there is no write-conflict snapshot earlier than this key
      //    (i.e. ikey_.sequence <= earliest_write_conflict_snapshot_).
      //
      // Rule 1 is needed for SingleDelete correctness.  Rule 2 is needed to
      // allow Transactions to do write-conflict checking (if we compacted away
      // all keys, then we wouldn't know that a write happened in this
      // snapshot).  If there is no earlier snapshot, then we know that there
      // are no active transactions that need to know about any writes.
      //
      // Optimization 3:
      // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
      // true, then we must output a SingleDelete.  In this case, we will decide
      // to also output the PUT.  While we are compacting less by outputting the
      // PUT now, hopefully this will lead to better compaction in the future
      // when Rule 2 is later true (i.e., we are hoping we can later compact out
      // both the SingleDelete and the Put, while we couldn't if we only
      // outputted the SingleDelete now).
      // In this case, we can save space by removing the PUT's value as it will
      // never be read.
      //
      // Deletes and Merges are not supported on the same key that has a
      // SingleDelete as it is not possible to correctly do any partial
      // compaction of such a combination of operations.  The result of mixing
      // those operations for a given key is documented as being undefined.  So
      // we can choose how to handle such a combination of operations.  We will
      // try to compact out as much as we can in these cases.
      // We will report counts on these anomalous cases.
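      //
      // Illustrative example: given SD(seq=12) directly followed by
      // PUT(seq=10) for the same user key with no snapshot in between, both
      // records are dropped when Rule 2 holds (seq 12 is at or below the
      // earliest write-conflict snapshot, or a key was already output in this
      // snapshot); otherwise the SD is output now and the PUT is output with
      // its value cleared on the next iteration (Optimization 3).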

      // The easiest way to process a SingleDelete during iteration is to peek
      // ahead at the next key.
      ParsedInternalKey next_ikey;
      input_->Next();

      // Check whether the next key exists, is not corrupt, and is the same key
      // as the single delete.
      if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
          cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
        // Check whether the next key belongs to the same snapshot as the
        // SingleDelete.
        if (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot) {
          if (next_ikey.type == kTypeSingleDeletion) {
            // We encountered two SingleDeletes in a row.  This could be due to
            // unexpected user input.
            // Skip the first SingleDelete and let the next iteration decide how
            // to handle the second SingleDelete

            // First SingleDelete has been skipped since we already called
            // input_->Next().
            ++iter_stats_.num_record_drop_obsolete;
            ++iter_stats_.num_single_del_mismatch;
          } else if ((ikey_.sequence <= earliest_write_conflict_snapshot_) ||
                     has_outputted_key_) {
            // Found a matching value, we can drop the single delete and the
            // value.  It is safe to drop both records since we've already
            // outputted a key in this snapshot, or there is no earlier
            // snapshot (Rule 2 above).

            // Note: it doesn't matter whether the second key is a Put or if it
            // is an unexpected Merge or Delete.  We will compact it out
            // either way. We will maintain counts of how many mismatches
            // happened
            if (next_ikey.type != kTypeValue) {
              ++iter_stats_.num_single_del_mismatch;
            }

            ++iter_stats_.num_record_drop_hidden;
            ++iter_stats_.num_record_drop_obsolete;
            // Already called input_->Next() once.  Call it a second time to
            // skip past the second key.
            input_->Next();
          } else {
            // Found a matching value, but we cannot drop both keys since
            // there is an earlier snapshot and we need to leave behind a record
            // to know that a write happened in this snapshot (Rule 2 above).
            // Clear the value and output the SingleDelete. (The value will be
            // outputted on the next iteration.)

            // Setting valid_ to true will output the current SingleDelete
            valid_ = true;

            // Set up the Put to be outputted in the next iteration.
            // (Optimization 3).
            clear_and_output_next_key_ = true;
          }
        } else {
          // We hit the next snapshot without hitting a put, so the iterator
          // returns the single delete.
          valid_ = true;
        }
      } else {
        // We are at the end of the input, could not parse the next key, or hit
        // a different key. The iterator returns the single delete if the key
        // possibly exists beyond the current output level.  We set
        // has_current_user_key to false so that if the iterator is at the next
        // key, we do not compare it again against the previous key at the next
        // iteration. If the next key is corrupt, we return before the
        // comparison, so the value of has_current_user_key does not matter.
        has_current_user_key_ = false;
        if (compaction_ != nullptr && ikey_.sequence <= earliest_snapshot_ &&
            compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                       &level_ptrs_)) {
          // Key doesn't exist outside of this range.
          // Can compact out this SingleDelete.
          ++iter_stats_.num_record_drop_obsolete;
          ++iter_stats_.num_single_del_fallthru;
        } else {
          // Output SingleDelete
          valid_ = true;
        }
      }

      if (valid_) {
        at_next_ = true;
      }
    } else if (last_snapshot == current_user_key_snapshot_) {
      // If the earliest snapshot in which this key is visible is the same
      // as the earliest snapshot in which a previous instance of the same
      // key is visible, then this kv is not visible in any snapshot: it is
      // hidden by a newer entry for the same user key.
      // TODO: why not > ?
      //
      // Note: Dropping this key will not affect TransactionDB write-conflict
      // checking since there has already been a record returned for this key
      // in this snapshot.
      assert(last_sequence >= current_user_key_sequence_);
      ++iter_stats_.num_record_drop_hidden;  // (A)
      input_->Next();
    } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
               ikey_.sequence <= earliest_snapshot_ &&
               compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                          &level_ptrs_)) {
      // TODO(noetzli): This is the only place where we use compaction_
      // (besides the constructor). We should probably get rid of this
      // dependency and find a way to do similar filtering during flushes.
      //
      // For this user key:
      // (1) there is no data in higher levels
      // (2) data in lower levels will have larger sequence numbers
      // (3) data in layers that are being compacted here and have
      //     smaller sequence numbers will be dropped in the next
      //     few iterations of this loop (by rule (A) above).
      // Therefore this deletion marker is obsolete and can be dropped.
      //
      // Note:  Dropping this Delete will not affect TransactionDB
      // write-conflict checking since it is earlier than any snapshot.
      ++iter_stats_.num_record_drop_obsolete;
      input_->Next();
    } else if (ikey_.type == kTypeMerge) {
      if (!merge_helper_->HasOperator()) {
        LogToBuffer(log_buffer_, "Options::merge_operator is null.");
        status_ = Status::InvalidArgument(
            "merge_operator is not properly initialized.");
        return;
      }

      pinned_iters_mgr_.StartPinning();
      // We know the merge type entry is not hidden, otherwise we would
      // have hit (A)
      // We encapsulate the merge related state machine in a different
      // object to minimize change to the existing flow.
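      //
      // MergeUntil consumes this key's consecutive merge operands (stopping
      // at the snapshot boundary prev_snapshot, at a Put/Delete it can merge
      // into, or at a different user key) and exposes the result(s) through
      // merge_out_iter_.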
      merge_helper_->MergeUntil(input_, range_del_agg_, prev_snapshot,
                                bottommost_level_);
      merge_out_iter_.SeekToFirst();

      if (merge_out_iter_.Valid()) {
        // NOTE: key, value, and ikey_ refer to old entries.
        //       These will be correctly set below.
        key_ = merge_out_iter_.key();
        value_ = merge_out_iter_.value();
        bool valid_key __attribute__((__unused__)) =
            ParseInternalKey(key_, &ikey_);
        // MergeUntil stops when it encounters a corrupt key and does not
        // include it in the result, so we expect the keys here to be valid.
        assert(valid_key);
        // Keep current_key_ in sync.
        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
        key_ = current_key_.GetKey();
        ikey_.user_key = current_key_.GetUserKey();
        valid_ = true;
      } else {
        // all merge operands were filtered out. reset the user key, since the
        // batch consumed by the merge operator should not shadow any keys
        // coming after the merges
        has_current_user_key_ = false;
        pinned_iters_mgr_.ReleasePinnedData();
      }
    } else {
      // 1. new user key -OR-
      // 2. different snapshot stripe
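      //
      // The remaining way this entry can be dropped is a range tombstone:
      // range_del_agg_->ShouldDelete(key_) reports whether a collected range
      // deletion covers this user key at a sequence number newer than
      // ikey_.sequence.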
      bool should_delete = range_del_agg_->ShouldDelete(key_);
      if (should_delete) {
        ++iter_stats_.num_record_drop_hidden;
        ++iter_stats_.num_record_drop_range_del;
        input_->Next();
      } else {
        valid_ = true;
      }
    }
  }
}

void CompactionIterator::PrepareOutput() {
  // Zeroing out the sequence number leads to better compression.
  // If this is the bottommost level (no files in lower levels)
  // and the earliest snapshot is larger than this seqno
  // and the userkey differs from the last userkey in compaction
  // then we can squash the seqno to zero.

  // This is safe for TransactionDB write-conflict checking since transactions
  // only care about sequence numbers larger than any active snapshot.
  if (bottommost_level_ && valid_ && ikey_.sequence < earliest_snapshot_ &&
      ikey_.type != kTypeMerge &&
      !cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) {
    assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
    ikey_.sequence = 0;
    current_key_.UpdateInternalKey(0, ikey_.type);
  }
}

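// Given the sequence number `in` of the current entry, returns the earliest
// snapshot with a sequence number >= in (i.e. the earliest snapshot that can
// see this entry) and stores the next smaller snapshot, or 0 if there is
// none, in *prev_snapshot.  Returns kMaxSequenceNumber when the entry is
// newer than every snapshot.  For example, with snapshots {5, 10, 20} and
// in == 7, this returns 10 and sets *prev_snapshot to 5.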
inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
    SequenceNumber in, SequenceNumber* prev_snapshot) {
  assert(snapshots_->size());
  SequenceNumber prev __attribute__((__unused__)) = kMaxSequenceNumber;
  for (const auto cur : *snapshots_) {
    assert(prev == kMaxSequenceNumber || prev <= cur);
    if (cur >= in) {
      *prev_snapshot = prev == kMaxSequenceNumber ? 0 : prev;
      return cur;
    }
    prev = cur;
    assert(prev < kMaxSequenceNumber);
  }
  *prev_snapshot = prev;
  return kMaxSequenceNumber;
}

}  // namespace rocksdb