//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/memtable.h"

#include <algorithm>
#include <limits>
#include <memory>

#include "db/dbformat.h"
#include "db/merge_context.h"
#include "db/merge_helper.h"
#include "db/pinned_iterators_manager.h"
#include "db/read_callback.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/port.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/internal_iterator.h"
#include "table/iterator_wrapper.h"
#include "table/merging_iterator.h"
#include "util/arena.h"
#include "util/autovector.h"
#include "util/coding.h"
#include "util/memory_usage.h"
#include "util/murmurhash.h"
#include "util/mutexlock.h"
#include "util/util.h"

namespace rocksdb {
ImmutableMemTableOptions::ImmutableMemTableOptions(
    const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options)
    : arena_block_size(mutable_cf_options.arena_block_size),
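      // Sizing example (illustrative, assumed numbers): a 64 MB write buffer
      // with memtable_prefix_bloom_size_ratio = 0.05 dedicates about 3.2 MB
      // (~26.8 million bits) to the prefix bloom filter.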
      memtable_prefix_bloom_bits(
          static_cast<uint32_t>(
              static_cast<double>(mutable_cf_options.write_buffer_size) *
              mutable_cf_options.memtable_prefix_bloom_size_ratio) *
          8u),
      memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
      inplace_update_support(ioptions.inplace_update_support),
      inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
      inplace_callback(ioptions.inplace_callback),
      max_successive_merges(mutable_cf_options.max_successive_merges),
      statistics(ioptions.statistics),
      merge_operator(ioptions.merge_operator),
      info_log(ioptions.info_log) {}

MemTable::MemTable(const InternalKeyComparator& cmp,
                   const ImmutableCFOptions& ioptions,
                   const MutableCFOptions& mutable_cf_options,
                   WriteBufferManager* write_buffer_manager,
                   SequenceNumber latest_seq, uint32_t column_family_id)
    : comparator_(cmp),
      moptions_(ioptions, mutable_cf_options),
      refs_(0),
      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
      mem_tracker_(write_buffer_manager),
      arena_(
          moptions_.arena_block_size,
          (write_buffer_manager != nullptr && write_buffer_manager->enabled())
              ? &mem_tracker_
              : nullptr,
          mutable_cf_options.memtable_huge_page_size),
      table_(mutable_cf_options.memtable_factory->CreateMemTableRep(
          comparator_, &arena_, ioptions, mutable_cf_options,
          column_family_id)),
      range_del_table_(SkipListFactory().CreateMemTableRep(
          comparator_, &arena_, nullptr /* transform */, ioptions.info_log,
          column_family_id)),
      is_range_del_table_empty_(true),
      data_size_(0),
      num_entries_(0),
      num_deletes_(0),
      write_buffer_size_(mutable_cf_options.write_buffer_size),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
      first_seqno_(0),
      earliest_seqno_(latest_seq),
      creation_seq_(latest_seq),
      mem_next_logfile_number_(0),
      min_prep_log_referenced_(0),
      locks_(moptions_.inplace_update_support
                 ? moptions_.inplace_update_num_locks
                 : 0),
      prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
      flush_state_(FLUSH_NOT_REQUESTED),
      env_(ioptions.env),
      insert_with_hint_prefix_extractor_(
          ioptions.memtable_insert_with_hint_prefix_extractor),
      oldest_key_time_(std::numeric_limits<uint64_t>::max()),
      atomic_flush_seqno_(kMaxSequenceNumber) {
  UpdateFlushState();
  // something went wrong if we need to flush before inserting anything
  assert(!ShouldScheduleFlush());

  if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
    prefix_bloom_.reset(new DynamicBloom(
        &arena_, moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality,
        6 /* hard coded 6 probes */, nullptr, moptions_.memtable_huge_page_size,
        ioptions.info_log));
  }
}
MemTableRep* MemTableRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
    const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options,
    uint32_t column_family_id) {
  return CreateMemTableRep(key_cmp, allocator,
                           mutable_cf_options.prefix_extractor.get(),
                           ioptions.info_log, column_family_id);
}

MemTable::~MemTable() {
  mem_tracker_.FreeMem();
  assert(refs_ == 0);
}

size_t MemTable::ApproximateMemoryUsage() {
  autovector<size_t> usages = {arena_.ApproximateMemoryUsage(),
                               table_->ApproximateMemoryUsage(),
                               range_del_table_->ApproximateMemoryUsage(),
                               rocksdb::ApproximateMemoryUsage(insert_hints_)};
  size_t total_usage = 0;
  for (size_t usage : usages) {
    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
    // The comparison is written this way to avoid numeric overflow.
    if (usage >= port::kMaxSizet - total_usage) {
      return port::kMaxSizet;
    }
    total_usage += usage;
  }
  // otherwise, return the actual usage
  return total_usage;
}

bool MemTable::ShouldFlushNow() const {
  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
  // Often we cannot allocate arena blocks that exactly match the buffer
  // size, so we have to decide whether to over-allocate or under-allocate.
  // This constant can be interpreted as: if we still have more than
  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we try to
  // over-allocate one more block.
  const double kAllowOverAllocationRatio = 0.6;

  // If the arena still has room for a new block allocation, we can safely
  // say it shouldn't flush.
  auto allocated_memory = table_->ApproximateMemoryUsage() +
                          range_del_table_->ApproximateMemoryUsage() +
                          arena_.MemoryAllocatedBytes();
  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
    return false;
  }

  // If the user keeps adding entries so that the total exceeds
  // write_buffer_size, we need to flush earlier even though we still have
  // much available memory left.
  if (allocated_memory >
      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }

  // In this code path, the arena has already allocated its "last block",
  // which means the total allocated memory size is either:
  //  (1) "moderately" over-allocated (by no more than `0.6 * arena block
  // size`), or
  //  (2) less than the write buffer size, but we'll stop here anyway, since
  // allocating a new arena block would over-allocate too much more (half of
  // the arena block size) memory.
  //
  // In either case, to avoid over-allocating, the last block will stop
  // allocation when its usage reaches a certain ratio. We carefully choose
  // "0.75 full" as the stop condition because it addresses the following
  // issue with great simplicity: what if the next inserted entry's size is
  // bigger than AllocatedAndUnused()?
  //
  // The answer is: if the entry size is also bigger than 0.25 *
  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
  // and regular block. In either case, we *overly* over-allocated.
  //
  // Therefore, setting the last block to be at most "0.75 full" avoids both
  // cases.
  //
  // NOTE: the average percentage of wasted space of this approach can be
  // estimated as: "arena block size * 0.25 / write buffer size". Users who
  // specify a small write buffer size and/or a big arena block size may
  // suffer.
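  //
  // Worked example (illustrative, assumed sizes): with a 64 MB write buffer
  // and a 4 MB arena block, the expected waste is about
  // 4 MB * 0.25 / 64 MB ~= 1.6% of the buffer.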
  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}

void MemTable::UpdateFlushState() {
  auto state = flush_state_.load(std::memory_order_relaxed);
  if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
    // ignore CAS failure, because that means somebody else requested
    // a flush
    flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
                                         std::memory_order_relaxed,
                                         std::memory_order_relaxed);
  }
}

void MemTable::UpdateOldestKeyTime() {
  uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
  if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
    int64_t current_time = 0;
    auto s = env_->GetCurrentTime(&current_time);
    if (s.ok()) {
      assert(current_time >= 0);
      // If fail, the timestamp is already set.
      oldest_key_time_.compare_exchange_strong(
          oldest_key_time, static_cast<uint64_t>(current_time),
          std::memory_order_relaxed, std::memory_order_relaxed);
    }
  }
}

int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
                                        const char* prefix_len_key2) const {
  // Internal keys are encoded as length-prefixed strings.
  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
  return comparator.CompareKeySeq(k1, k2);
}

int MemTable::KeyComparator::operator()(const char* prefix_len_key,
                                        const Slice& key) const {
  // Internal keys are encoded as length-prefixed strings.
  Slice a = GetLengthPrefixedSlice(prefix_len_key);
  return comparator.CompareKeySeq(a, key);
}

// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
const char* EncodeKey(std::string* scratch, const Slice& target) {
  scratch->clear();
  PutVarint32(scratch, static_cast<uint32_t>(target.size()));
  scratch->append(target.data(), target.size());
  return scratch->data();
}
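
// Layout of a length-prefixed internal key, as implied by EncodeKey() and
// the tag decoding in this file (a sketch, not an authoritative spec):
//   varint32  internal_key_size
//   char[internal_key_size - 8]  user_key
//   fixed64   packed tag: (sequence_number << 8) | value_type
// Values are stored and encoded separately by the MemTableRep.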

class MemTableIterator : public InternalIterator {
 public:
  MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
                   Arena* arena, bool use_range_del_table = false)
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
        comparator_(mem.comparator_),
        valid_(false),
        arena_mode_(arena != nullptr),
        value_pinned_(
            !mem.GetImmutableMemTableOptions()->inplace_update_support) {
    if (use_range_del_table) {
      iter_ = mem.range_del_table_->GetIterator(arena);
    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
      bloom_ = mem.prefix_bloom_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
      iter_ = mem.table_->GetIterator(arena);
    }
    is_seek_for_prev_supported_ = iter_->IsSeekForPrevSupported();
  }

  ~MemTableIterator() {
#ifndef NDEBUG
    // Assert that the MemTableIterator is never deleted while
    // Pinning is Enabled.
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
#endif
    if (arena_mode_) {
      iter_->~Iterator();
    } else {
      delete iter_;
    }
  }
#ifndef NDEBUG
  virtual void SetPinnedItersMgr(
      PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

  virtual bool Valid() const override { return valid_; }
  virtual void Seek(const Slice& k) override {
    PERF_TIMER_GUARD(seek_on_memtable_time);
    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
    if (bloom_ != nullptr) {
      if (!bloom_->MayContain(
              prefix_extractor_->Transform(ExtractUserKey(k)))) {
        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
        valid_ = false;
        return;
      } else {
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
      }
    }
    iter_->Seek(k, nullptr);
    valid_ = iter_->Valid();
  }
  virtual void SeekForPrev(const Slice& k) override {
    PERF_TIMER_GUARD(seek_on_memtable_time);
    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
    if (bloom_ != nullptr) {
      if (!bloom_->MayContain(
              prefix_extractor_->Transform(ExtractUserKey(k)))) {
        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
        valid_ = false;
        return;
      } else {
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
      }
    }
    if (is_seek_for_prev_supported_) {
      iter_->SeekForPrev(k, nullptr);
      valid_ = iter_->Valid();
    } else {
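      // Emulate SeekForPrev for reps without native support: land on the
      // first entry >= k (or the last entry if there is none), then walk
      // back until key() <= k.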
      iter_->Seek(k, nullptr);
      valid_ = iter_->Valid();
      if (!Valid()) {
        SeekToLast();
      }
      while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
        Prev();
      }
    }
  }
  virtual void SeekToFirst() override {
    iter_->SeekToFirst();
    valid_ = iter_->Valid();
  }
  virtual void SeekToLast() override {
    iter_->SeekToLast();
    valid_ = iter_->Valid();
  }
  virtual void Next() override {
    PERF_COUNTER_ADD(next_on_memtable_count, 1);
    assert(Valid());
    iter_->Next();
    valid_ = iter_->Valid();
  }
  virtual void Prev() override {
    PERF_COUNTER_ADD(prev_on_memtable_count, 1);
    assert(Valid());
    iter_->Prev();
    valid_ = iter_->Valid();
  }
  virtual Slice key() const override {
    assert(Valid());
    return iter_->GetKey();
  }
  virtual Slice value() const override {
    assert(Valid());
    return iter_->GetValue();
  }

  virtual Status status() const override { return Status::OK(); }

  virtual bool IsKeyPinned() const override { return iter_->IsKeyPinned(); }

  virtual bool IsValuePinned() const override {
    // memtable value is always pinned, except if we allow inplace update.
    return value_pinned_;
  }

 private:
  DynamicBloom* bloom_;
  const SliceTransform* const prefix_extractor_;
  const MemTable::KeyComparator comparator_;
  MemTableRep::Iterator* iter_;
  bool valid_;
  bool arena_mode_;
  bool value_pinned_;
  bool is_seek_for_prev_supported_;

  // No copying allowed
  MemTableIterator(const MemTableIterator&);
  void operator=(const MemTableIterator&);
};

InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
                                        Arena* arena) {
  assert(arena != nullptr);
  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
  return new (mem) MemTableIterator(*this, read_options, arena);
}

InternalIterator* MemTable::NewRangeTombstoneIterator(
    const ReadOptions& read_options) {
  if (read_options.ignore_range_deletions || is_range_del_table_empty_) {
    return nullptr;
  }
  return new MemTableIterator(*this, read_options, nullptr /* arena */,
                              true /* use_range_del_table */);
}

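// Striped locking for in-place updates: a key is hashed to one of
// moptions_.inplace_update_num_locks read-write mutexes, so writers to
// unrelated keys rarely contend on the same lock.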
port::RWMutex* MemTable::GetLock(const Slice& key) {
  static murmur_hash hash;
  return &locks_[hash(key) % locks_.size()];
}

MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
                                                   const Slice& end_ikey) {
  uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
  entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
  if (entry_count == 0) {
    return {0, 0};
  }
  uint64_t n = num_entries_.load(std::memory_order_relaxed);
  if (n == 0) {
    return {0, 0};
  }
  if (entry_count > n) {
    // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
    // be larger than actual entries we have. Cap it to entries we have to limit
    // the inaccuracy.
    entry_count = n;
  }
  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
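  // Scale the entry count by the memtable-wide average entry size
  // (data_size / n) to approximate the number of bytes in the range.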
  return {entry_count * (data_size / n), entry_count};
}

bool MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key, /* user key */
                   const Slice& value, bool allow_concurrent,
                   MemTablePostProcessInfo* post_process_info) {
  std::unique_ptr<MemTableRep>& table =
      type == kTypeRangeDeletion ? range_del_table_ : table_;

  InternalKey internal_key(key, s, type);
  size_t encoded_len =
      MemTableRep::EncodeKeyValueSize(internal_key.Encode(), value);
  if (!allow_concurrent) {
    // Extract prefix for insert with hint.
    if (insert_with_hint_prefix_extractor_ != nullptr &&
        insert_with_hint_prefix_extractor_->InDomain(internal_key.user_key())) {
      Slice prefix = insert_with_hint_prefix_extractor_->Transform(
          internal_key.user_key());
      bool res = table->InsertKeyValueWithHint(internal_key.Encode(), value,
                                               &insert_hints_[prefix]);
      if (UNLIKELY(!res)) {
        return res;
      }
    } else {
      bool res = table->InsertKeyValue(internal_key.Encode(), value);
      if (UNLIKELY(!res)) {
        return res;
      }
    }

    // this is a bit ugly, but is the way to avoid locked instructions
    // when incrementing an atomic
    num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
                       std::memory_order_relaxed);
    data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
                     std::memory_order_relaxed);
    if (type == kTypeDeletion) {
      num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
                         std::memory_order_relaxed);
    }
    if (prefix_bloom_) {
      assert(prefix_extractor_);
      prefix_bloom_->Add(prefix_extractor_->Transform(key));
    }
    // The first sequence number inserted into the memtable
    assert(first_seqno_ == 0 || s >= first_seqno_);
    if (first_seqno_ == 0) {
      first_seqno_.store(s, std::memory_order_relaxed);
      if (earliest_seqno_ == kMaxSequenceNumber) {
        earliest_seqno_.store(GetFirstSequenceNumber(),
                              std::memory_order_relaxed);
      }
      assert(first_seqno_.load() >= earliest_seqno_.load());
    }
    assert(post_process_info == nullptr);
    UpdateFlushState();
  } else {
    bool res = table->InsertKeyValueConcurrently(internal_key.Encode(), value);
    if (UNLIKELY(!res)) {
      return res;
    }
    assert(post_process_info != nullptr);
    post_process_info->num_entries++;
    post_process_info->data_size += encoded_len;
    if (type == kTypeDeletion) {
      post_process_info->num_deletes++;
    }

    if (prefix_bloom_) {
      assert(prefix_extractor_);
      prefix_bloom_->AddConcurrently(prefix_extractor_->Transform(key));
    }

    // atomically update first_seqno_ and earliest_seqno_.
    uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
    while ((cur_seq_num == 0 || s < cur_seq_num) &&
           !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
    }
    uint64_t cur_earliest_seqno =
        earliest_seqno_.load(std::memory_order_relaxed);
    while (
        (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
        !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
    }
  }
  if (is_range_del_table_empty_ && type == kTypeRangeDeletion) {
    is_range_del_table_empty_ = false;
  }
  UpdateOldestKeyTime();
  return true;
}

// Callback from MemTable::Get()
namespace {

struct Saver {
  Status* status;
  const LookupKey* key;
  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
  bool* merge_in_progress;
  std::string* value;
  SequenceNumber seq;
  const MergeOperator* merge_operator;
  // the merge operations encountered;
  MergeContext* merge_context;
  RangeDelAggregator* range_del_agg;
  MemTable* mem;
  Logger* logger;
  Statistics* statistics;
  bool inplace_update_support;
  Env* env_;
  ReadCallback* callback_;
  bool* is_blob_index;

  bool CheckCallback(SequenceNumber _seq) {
    if (callback_) {
      return callback_->IsVisible(_seq);
    }
    return true;
  }
};
}  // namespace

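// Callback passed to MemTableRep::Get(): returns true to keep scanning
// (e.g. past entries that are invisible to the read callback, or while
// accumulating merge operands) and false to stop once a final value or an
// error has been found.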
static bool SaveValue(void* arg, const MemTableRep::KeyValuePair* pair) {
  Saver* s = reinterpret_cast<Saver*>(arg);
  MergeContext* merge_context = s->merge_context;
  RangeDelAggregator* range_del_agg = s->range_del_agg;
  const MergeOperator* merge_operator = s->merge_operator;

  assert(s != nullptr && merge_context != nullptr && range_del_agg != nullptr);

  Slice internal_key, value;
  std::tie(internal_key, value) = pair->GetKeyValue();
  // Check that it belongs to same user key.  We do not check the
  // sequence number since the Seek() call above should have skipped
  // all entries with overly large sequence numbers.
  if (s->mem->GetInternalKeyComparator().user_comparator()->Equal(
          ExtractUserKey(internal_key), s->key->user_key())) {
    // Correct user key
    const uint64_t tag =
        DecodeFixed64(internal_key.data() + internal_key.size() - 8);
    ValueType type;
    SequenceNumber seq;
    UnPackSequenceAndType(tag, &seq, &type);
    // If the value is not in the snapshot, skip it
    if (!s->CheckCallback(seq)) {
      return true;  // to continue to the next seq
    }

    s->seq = seq;
    if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
        range_del_agg->ShouldDelete(internal_key)) {
      type = kTypeRangeDeletion;
    }
    switch (type) {
      case kTypeBlobIndex:
        if (s->is_blob_index == nullptr) {
          ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index.");
          *(s->status) = Status::NotSupported(
              "Encounter unsupported blob value. Please open DB with "
              "rocksdb::blob_db::BlobDB instead.");
        } else if (*(s->merge_in_progress)) {
          *(s->status) =
              Status::NotSupported("Blob DB does not support merge operator.");
        }
        if (!s->status->ok()) {
          *(s->found_final_value) = true;
          return false;
        }
        FALLTHROUGH_INTENDED;
      case kTypeValue: {
        if (s->inplace_update_support) {
          s->mem->GetLock(s->key->user_key())->ReadLock();
        }
        *(s->status) = Status::OK();
        if (*(s->merge_in_progress)) {
          if (s->value != nullptr) {
            *(s->status) = MergeHelper::TimedFullMerge(
                merge_operator, s->key->user_key(), &value,
                merge_context->GetOperands(), s->value, s->logger,
                s->statistics, s->env_, nullptr /* result_operand */, true);
          }
        } else if (s->value != nullptr) {
          s->value->assign(value.data(), value.size());
        }
        if (s->inplace_update_support) {
          s->mem->GetLock(s->key->user_key())->ReadUnlock();
        }
        *(s->found_final_value) = true;
        if (s->is_blob_index != nullptr) {
          *(s->is_blob_index) = (type == kTypeBlobIndex);
        }
        return false;
      }
      case kTypeDeletion:
      case kTypeSingleDeletion:
      case kTypeRangeDeletion: {
        if (*(s->merge_in_progress)) {
          if (s->value != nullptr) {
            *(s->status) = MergeHelper::TimedFullMerge(
                merge_operator, s->key->user_key(), nullptr,
                merge_context->GetOperands(), s->value, s->logger,
                s->statistics, s->env_, nullptr /* result_operand */, true);
          }
        } else {
          *(s->status) = Status::NotFound();
        }
        *(s->found_final_value) = true;
        return false;
      }
      case kTypeMerge: {
        if (!merge_operator) {
          *(s->status) = Status::InvalidArgument(
              "merge_operator is not properly initialized.");
          // Normally we continue the loop (return true) when we see a merge
          // operand.  But in case of an error, we should stop the loop
          // immediately and pretend we have found the value to stop further
          // seek.  Otherwise, the later call will override this error status.
          *(s->found_final_value) = true;
          return false;
        }
        *(s->merge_in_progress) = true;
        merge_context->PushOperand(
            value, s->inplace_update_support == false /* operand_pinned */);
        if (merge_operator->ShouldMerge(
                merge_context->GetOperandsDirectionBackward())) {
          *(s->status) = MergeHelper::TimedFullMerge(
              merge_operator, s->key->user_key(), nullptr,
              merge_context->GetOperands(), s->value, s->logger, s->statistics,
              s->env_, nullptr /* result_operand */, true);
          *(s->found_final_value) = true;
          return false;
        }
        return true;
      }
      default:
        assert(false);
        return true;
    }
  }

  // s->state could be Corrupt, merge or notfound
  return false;
}

bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
                   MergeContext* merge_context,
                   RangeDelAggregator* range_del_agg, SequenceNumber* seq,
                   const ReadOptions& read_opts, ReadCallback* callback,
                   bool* is_blob_index) {
  // The sequence number is updated synchronously in version_set.h
  if (IsEmpty()) {
    // Avoiding recording stats for speed.
    return false;
  }
  PERF_TIMER_GUARD(get_from_memtable_time);

  std::unique_ptr<InternalIterator> range_del_iter(
      NewRangeTombstoneIterator(read_opts));
  Status status = range_del_agg->AddTombstones(std::move(range_del_iter));
  if (!status.ok()) {
    *s = status;
    return false;
  }

  Slice user_key = key.user_key();
  bool found_final_value = false;
  bool merge_in_progress = s->IsMergeInProgress();
  bool const may_contain =
      nullptr == prefix_bloom_
          ? false
          : prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
  if (prefix_bloom_ && !may_contain) {
    // The prefix bloom filter says the key does not exist in this memtable.
    PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
    *seq = kMaxSequenceNumber;
  } else {
    if (prefix_bloom_) {
      PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
    }
    Saver saver;
    saver.status = s;
    saver.found_final_value = &found_final_value;
    saver.merge_in_progress = &merge_in_progress;
    saver.key = &key;
    saver.value = value;
    saver.seq = kMaxSequenceNumber;
    saver.mem = this;
    saver.merge_context = merge_context;
    saver.range_del_agg = range_del_agg;
    saver.merge_operator = moptions_.merge_operator;
    saver.logger = moptions_.info_log;
    saver.inplace_update_support = moptions_.inplace_update_support;
    saver.statistics = moptions_.statistics;
    saver.env_ = env_;
    saver.callback_ = callback;
    saver.is_blob_index = is_blob_index;
    table_->Get(key, &saver, SaveValue);

    *seq = saver.seq;
  }

  // No change to value, since we have not yet found a Put/Delete
  if (!found_final_value && merge_in_progress) {
    *s = Status::MergeInProgress();
  }
  PERF_COUNTER_ADD(get_from_memtable_count, 1);
  return found_final_value;
}

void MemTable::Update(SequenceNumber seq,
                      const Slice& key,
                      const Slice& value) {
  LookupKey lkey(key, seq);
  Slice mem_key = lkey.memtable_key();

  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetDynamicPrefixIterator());
  iter->Seek(lkey.internal_key(), mem_key.data());

  if (iter->Valid()) {
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    Slice internal_key, old_value;
    std::tie(internal_key, old_value) = iter->GetKeyValue();
    if (comparator_.comparator.user_comparator()->Equal(
            ExtractUserKey(internal_key), lkey.user_key())) {
      // Correct user key
      const uint64_t tag =
          DecodeFixed64(internal_key.data() + internal_key.size() - 8);
      ValueType type;
      SequenceNumber unused;
      UnPackSequenceAndType(tag, &unused, &type);
      if (type == kTypeValue) {
        uint32_t old_size = static_cast<uint32_t>(old_value.size());
        uint32_t new_size = static_cast<uint32_t>(value.size());

        // Update value, if new value size <= old value size
        if (new_size <= old_size) {
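          // Rewrite in place: the new varint32 length is encoded where the
          // old one started, and the value bytes are copied right after it,
          // so a shorter value simply shifts left and stays decodable.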
          char* p =
              const_cast<char*>(old_value.data()) - VarintLength(old_size);
          p = EncodeVarint32(p, new_size);
          WriteLock wl(GetLock(lkey.user_key()));
          memcpy(p, value.data(), value.size());
          RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
          return;
        }
      }
    }
  }

  // key doesn't exist
  bool add_res __attribute__((__unused__));
  add_res = Add(seq, kTypeValue, key, value);
  // We already checked unused != seq above. In that case, Add should not fail.
  assert(add_res);
}

bool MemTable::UpdateCallback(SequenceNumber seq,
                              const Slice& key,
                              const Slice& delta) {
  LookupKey lkey(key, seq);
  Slice memkey = lkey.memtable_key();

  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetDynamicPrefixIterator());
  iter->Seek(lkey.internal_key(), memkey.data());

  if (iter->Valid()) {
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    Slice internal_key, old_value;
    std::tie(internal_key, old_value) = iter->GetKeyValue();
    if (comparator_.comparator.user_comparator()->Equal(
            ExtractUserKey(internal_key), lkey.user_key())) {
      // Correct user key
      const uint64_t tag =
          DecodeFixed64(internal_key.data() + internal_key.size() - 8);
      ValueType type;
      uint64_t unused;
      UnPackSequenceAndType(tag, &unused, &type);
      switch (type) {
        case kTypeValue: {
          uint32_t old_size = static_cast<uint32_t>(old_value.size());

          char* old_buffer = const_cast<char*>(old_value.data());
          uint32_t new_inplace_size = old_size;

          std::string merged_value;
          WriteLock wl(GetLock(lkey.user_key()));
          auto status = moptions_.inplace_callback(old_buffer, &new_inplace_size,
                                                   delta, &merged_value);
          if (status == UpdateStatus::UPDATED_INPLACE) {
            // Value already updated by callback.
            assert(new_inplace_size <= old_size);
            if (new_inplace_size < old_size) {
              // overwrite the new inplace size
              char* p =
                const_cast<char*>(old_value.data()) - VarintLength(old_size);
              p = EncodeVarint32(p, new_inplace_size);
              if (p < old_buffer) {
                // shift the value buffer as well.
                memmove(p, old_buffer, new_inplace_size);
              }
            }
            RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
            UpdateFlushState();
            return true;
          } else if (status == UpdateStatus::UPDATED) {
            Add(seq, kTypeValue, key, Slice(merged_value));
            RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
            UpdateFlushState();
            return true;
          } else if (status == UpdateStatus::UPDATE_FAILED) {
            // No action required. Return.
            UpdateFlushState();
            return true;
          }
        }
          break;
        default:
          break;
      }
    }
  }
  // If the latest value is not kTypeValue
  // or key doesn't exist
  return false;
}

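// Counts the consecutive kTypeMerge entries for `key`, newest first; callers
// can compare the result against max_successive_merges to decide when to
// collapse the operands eagerly (how callers use it is not shown here).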
size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
  Slice memkey = key.memtable_key();

  // A total ordered iterator is costly for some memtablerep (prefix aware
  // reps). By passing in the user key, we allow efficient iterator creation.
  // The iterator only needs to be ordered within the same user key.
  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetDynamicPrefixIterator());
  iter->Seek(key.internal_key(), memkey.data());

  size_t num_successive_merges = 0;

  for (; iter->Valid(); iter->Next()) {
    Slice internal_key = iter->GetKey();
    if (!comparator_.comparator.user_comparator()->Equal(
            ExtractUserKey(internal_key), key.user_key())) {
      break;
    }

    const uint64_t tag =
        DecodeFixed64(internal_key.data() + internal_key.size() - 8);
    ValueType type;
    uint64_t unused;
    UnPackSequenceAndType(tag, &unused, &type);
    if (type != kTypeMerge) {
      break;
    }

    ++num_successive_merges;
  }

  return num_successive_merges;
}

void MemTable::RefLogContainingPrepSection(uint64_t log) {
  assert(log > 0);
  auto cur = min_prep_log_referenced_.load();
  while ((log < cur || cur == 0) &&
         !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
    cur = min_prep_log_referenced_.load();
  }
}

uint64_t MemTable::GetMinLogContainingPrepSection() {
  return min_prep_log_referenced_.load();
}

}  // namespace rocksdb