memtable.cc 24.8 KB
Newer Older
1 2 3 4 5
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8 9 10
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/memtable.h"
J
Jim Paton 已提交
11 12

#include <memory>
13
#include <algorithm>
14
#include <limits>
J
Jim Paton 已提交
15

J
jorlow@chromium.org 已提交
16
#include "db/dbformat.h"
17
#include "db/merge_context.h"
18
#include "db/writebuffer.h"
19 20 21 22
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
K
kailiu 已提交
23
#include "rocksdb/slice_transform.h"
24
#include "table/merger.h"
K
kailiu 已提交
25
#include "util/arena.h"
J
jorlow@chromium.org 已提交
26
#include "util/coding.h"
J
Jim Paton 已提交
27
#include "util/murmurhash.h"
K
Kai Liu 已提交
28
#include "util/mutexlock.h"
29
#include "util/perf_context_imp.h"
I
Igor Canadi 已提交
30
#include "util/statistics.h"
31
#include "util/stop_watch.h"
J
jorlow@chromium.org 已提交
32

33
namespace rocksdb {
J
jorlow@chromium.org 已提交
34

35
// Snapshots the subset of column-family options the memtable needs.
// Values that may change at runtime are read from `mutable_cf_options`;
// the rest are read from the immutable `ioptions`.
MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions,
                                 const MutableCFOptions& mutable_cf_options)
    // Sizing.
    : write_buffer_size(mutable_cf_options.write_buffer_size),
      arena_block_size(mutable_cf_options.arena_block_size),
      // Prefix bloom filter configuration.
      memtable_prefix_bloom_bits(
          mutable_cf_options.memtable_prefix_bloom_bits),
      memtable_prefix_bloom_probes(
          mutable_cf_options.memtable_prefix_bloom_probes),
      memtable_prefix_bloom_huge_page_tlb_size(
          mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
      // In-place update configuration.
      inplace_update_support(ioptions.inplace_update_support),
      inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
      inplace_callback(ioptions.inplace_callback),
      // Merge / delete handling.
      max_successive_merges(mutable_cf_options.max_successive_merges),
      filter_deletes(mutable_cf_options.filter_deletes),
      // Shared services.
      statistics(ioptions.statistics),
      merge_operator(ioptions.merge_operator),
      info_log(ioptions.info_log) {}
L
Lei Jin 已提交
53 54 55

// Builds an empty memtable.
//
//  cmp                - internal-key comparator used to order entries.
//  ioptions           - immutable column-family options (memtable factory,
//                       prefix extractor, logger, env, ...).
//  mutable_cf_options - column-family options that can change at runtime.
//  write_buffer       - handed to the allocator together with the arena.
//  earliest_seq       - initial value of earliest_seqno_.
MemTable::MemTable(const InternalKeyComparator& cmp,
                   const ImmutableCFOptions& ioptions,
                   const MutableCFOptions& mutable_cf_options,
                   WriteBuffer* write_buffer, SequenceNumber earliest_seq)
    : comparator_(cmp),
      moptions_(ioptions, mutable_cf_options),
      refs_(0),
      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
      arena_(moptions_.arena_block_size),
      allocator_(&arena_, write_buffer),
      table_(ioptions.memtable_factory->CreateMemTableRep(
          comparator_, &allocator_, ioptions.prefix_extractor,
          ioptions.info_log)),
      num_entries_(0),
      num_deletes_(0),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
      first_seqno_(0),
      earliest_seqno_(earliest_seq),
      mem_next_logfile_number_(0),
      // The striped locks are only needed for in-place updates.
      locks_(moptions_.inplace_update_support
                 ? moptions_.inplace_update_num_locks
                 : 0),
      prefix_extractor_(ioptions.prefix_extractor),
      should_flush_(ShouldFlushNow()),
      flush_scheduled_(false),
      env_(ioptions.env) {
  // A memtable that already wants to be flushed before a single entry has
  // been inserted indicates that something went wrong earlier.
  assert(!should_flush_);

  const bool wants_prefix_bloom =
      prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0;
  if (wants_prefix_bloom) {
    prefix_bloom_.reset(new DynamicBloom(
        &allocator_, moptions_.memtable_prefix_bloom_bits,
        ioptions.bloom_locality, moptions_.memtable_prefix_bloom_probes,
        nullptr, moptions_.memtable_prefix_bloom_huge_page_tlb_size,
        ioptions.info_log));
  }
}
J
jorlow@chromium.org 已提交
94

I
Igor Canadi 已提交
95
// The memtable must not be destroyed while references to it are outstanding.
MemTable::~MemTable() {
  assert(refs_ == 0);
}
J
jorlow@chromium.org 已提交
96

J
Jim Paton 已提交
97
size_t MemTable::ApproximateMemoryUsage() {
98 99 100 101 102 103 104 105 106 107
  size_t arena_usage = arena_.ApproximateMemoryUsage();
  size_t table_usage = table_->ApproximateMemoryUsage();
  // let MAX_USAGE =  std::numeric_limits<size_t>::max()
  // then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
  // the following variation is to avoid numeric overflow.
  if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) {
    return std::numeric_limits<size_t>::max();
  }
  // otherwise, return the actual usage
  return arena_usage + table_usage;
J
Jim Paton 已提交
108
}
J
jorlow@chromium.org 已提交
109

110 111 112 113 114 115 116 117 118 119 120 121 122 123
bool MemTable::ShouldFlushNow() const {
  // In a lot of times, we cannot allocate arena blocks that exactly matches the
  // buffer size. Thus we have to decide if we should over-allocate or
  // under-allocate.
  // This constant avariable can be interpreted as: if we still have more than
  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
  // allocate one more block.
  const double kAllowOverAllocationRatio = 0.6;

  // If arena still have room for new block allocation, we can safely say it
  // shouldn't flush.
  auto allocated_memory =
      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();

124 125 126
  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
L
Lei Jin 已提交
127 128
      moptions_.write_buffer_size +
      kArenaBlockSize * kAllowOverAllocationRatio) {
129 130 131
    return false;
  }

L
Lei Jin 已提交
132 133 134 135 136
  // if user keeps adding entries that exceeds moptions.write_buffer_size,
  // we need to flush earlier even though we still have much available
  // memory left.
  if (allocated_memory > moptions_.write_buffer_size +
      kArenaBlockSize * kAllowOverAllocationRatio) {
137 138 139 140 141
    return true;
  }

  // In this code path, Arena has already allocated its "last block", which
  // means the total allocatedmemory size is either:
142
  //  (1) "moderately" over allocated the memory (no more than `0.6 * arena
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
  // block size`. Or,
  //  (2) the allocated memory is less than write buffer size, but we'll stop
  // here since if we allocate a new arena block, we'll over allocate too much
  // more (half of the arena block size) memory.
  //
  // In either case, to avoid over-allocate, the last block will stop allocation
  // when its usage reaches a certain ratio, which we carefully choose "0.75
  // full" as the stop condition because it addresses the following issue with
  // great simplicity: What if the next inserted entry's size is
  // bigger than AllocatedAndUnused()?
  //
  // The answer is: if the entry size is also bigger than 0.25 *
  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
  // and regular block. In either case, we *overly* over-allocated.
  //
  // Therefore, setting the last block to be at most "0.75 full" avoids both
  // cases.
  //
  // NOTE: the average percentage of waste space of this approach can be counted
  // as: "arena block size * 0.25 / write buffer size". User who specify a small
  // write buffer size and/or big arena block size may suffer.
  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}

168 169 170 171 172 173 174 175 176 177
// Compares two memtable entries. Both arguments point at length-prefixed
// internal keys; the prefixes are decoded and the keys are handed to the
// wrapped comparator.
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
                                        const char* prefix_len_key2) const {
  const Slice lhs = GetLengthPrefixedSlice(prefix_len_key1);
  const Slice rhs = GetLengthPrefixedSlice(prefix_len_key2);
  return comparator.Compare(lhs, rhs);
}

int MemTable::KeyComparator::operator()(const char* prefix_len_key,
                                        const Slice& key)
J
jorlow@chromium.org 已提交
178 179
    const {
  // Internal keys are encoded as length-prefixed strings.
180 181
  Slice a = GetLengthPrefixedSlice(prefix_len_key);
  return comparator.Compare(a, key);
J
jorlow@chromium.org 已提交
182 183
}

J
Jim Paton 已提交
184 185 186 187 188
// Returns the length-prefixed slice stored at `key` with its last 8 bytes
// dropped.
Slice MemTableRep::UserKey(const char* key) const {
  const Slice internal_key = GetLengthPrefixedSlice(key);
  const size_t user_key_len = internal_key.size() - 8;
  return Slice(internal_key.data(), user_key_len);
}

189
// Allocates `len` bytes from the allocator. The buffer is returned both
// through `*buf` (as char*) and as the opaque KeyHandle return value.
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
  char* const mem = allocator_->Allocate(len);
  *buf = mem;
  return static_cast<KeyHandle>(mem);
}

J
jorlow@chromium.org 已提交
194 195 196
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
197
const char* EncodeKey(std::string* scratch, const Slice& target) {
J
jorlow@chromium.org 已提交
198
  scratch->clear();
199
  PutVarint32(scratch, static_cast<uint32_t>(target.size()));
J
jorlow@chromium.org 已提交
200 201 202 203 204 205
  scratch->append(target.data(), target.size());
  return scratch->data();
}

class MemTableIterator: public Iterator {
 public:
206
  MemTableIterator(
L
Lei Jin 已提交
207
      const MemTable& mem, const ReadOptions& read_options, Arena* arena)
208 209
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
210 211
        valid_(false),
        arena_mode_(arena != nullptr) {
L
Lei Jin 已提交
212
    if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
213
      bloom_ = mem.prefix_bloom_.get();
214
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
215
    } else {
216 217 218 219 220 221 222 223 224
      iter_ = mem.table_->GetIterator(arena);
    }
  }

  ~MemTableIterator() {
    if (arena_mode_) {
      iter_->~Iterator();
    } else {
      delete iter_;
225 226
    }
  }
J
Jim Paton 已提交
227

I
Igor Sugak 已提交
228 229
  virtual bool Valid() const override { return valid_; }
  virtual void Seek(const Slice& k) override {
I
Igor Canadi 已提交
230 231
    PERF_TIMER_GUARD(seek_on_memtable_time);
    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
232 233
    if (bloom_ != nullptr &&
        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
234 235 236 237 238 239
      valid_ = false;
      return;
    }
    iter_->Seek(k, nullptr);
    valid_ = iter_->Valid();
  }
I
Igor Sugak 已提交
240
  virtual void SeekToFirst() override {
241 242 243
    iter_->SeekToFirst();
    valid_ = iter_->Valid();
  }
I
Igor Sugak 已提交
244
  virtual void SeekToLast() override {
245 246 247
    iter_->SeekToLast();
    valid_ = iter_->Valid();
  }
I
Igor Sugak 已提交
248
  virtual void Next() override {
249 250 251 252
    assert(Valid());
    iter_->Next();
    valid_ = iter_->Valid();
  }
I
Igor Sugak 已提交
253
  virtual void Prev() override {
254 255 256 257
    assert(Valid());
    iter_->Prev();
    valid_ = iter_->Valid();
  }
I
Igor Sugak 已提交
258
  virtual Slice key() const override {
259
    assert(Valid());
J
Jim Paton 已提交
260 261
    return GetLengthPrefixedSlice(iter_->key());
  }
I
Igor Sugak 已提交
262
  virtual Slice value() const override {
263
    assert(Valid());
J
Jim Paton 已提交
264
    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
J
jorlow@chromium.org 已提交
265 266 267
    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
  }

I
Igor Sugak 已提交
268
  virtual Status status() const override { return Status::OK(); }
J
jorlow@chromium.org 已提交
269 270

 private:
271 272
  DynamicBloom* bloom_;
  const SliceTransform* const prefix_extractor_;
273
  MemTableRep::Iterator* iter_;
274
  bool valid_;
275
  bool arena_mode_;
J
jorlow@chromium.org 已提交
276 277 278 279 280 281

  // No copying allowed
  MemTableIterator(const MemTableIterator&);
  void operator=(const MemTableIterator&);
};

L
Lei Jin 已提交
282
// Constructs a MemTableIterator in storage obtained from `arena`, which must
// not be null.
Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
  assert(arena != nullptr);
  void* const storage = arena->AllocateAligned(sizeof(MemTableIterator));
  return new (storage) MemTableIterator(*this, read_options, arena);
}

288
// Returns the lock stripe that guards in-place updates of `key`, chosen by
// hashing the key over the lock array.
//
// Precondition: the lock array is non-empty. The constructor sizes it to
// zero when in-place update support is off, in which case the modulo below
// would divide by zero; the assert makes that misuse visible in debug
// builds.
port::RWMutex* MemTable::GetLock(const Slice& key) {
  assert(locks_.size() > 0);
  static murmur_hash hash;
  return &locks_[hash(key) % locks_.size()];
}

J
jorlow@chromium.org 已提交
293
void MemTable::Add(SequenceNumber s, ValueType type,
294
                   const Slice& key, /* user key */
J
jorlow@chromium.org 已提交
295 296 297 298 299 300
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
301 302 303 304 305 306
  uint32_t key_size = static_cast<uint32_t>(key.size());
  uint32_t val_size = static_cast<uint32_t>(value.size());
  uint32_t internal_key_size = key_size + 8;
  const uint32_t encoded_len = VarintLength(internal_key_size) +
                               internal_key_size + VarintLength(val_size) +
                               val_size;
307 308 309
  char* buf = nullptr;
  KeyHandle handle = table_->Allocate(encoded_len, &buf);
  assert(buf != nullptr);
J
jorlow@chromium.org 已提交
310 311 312
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
A
agiardullo 已提交
313 314
  uint64_t packed = PackSequenceAndType(s, type);
  EncodeFixed64(p, packed);
J
jorlow@chromium.org 已提交
315 316 317
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
I
Igor Canadi 已提交
318
  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
319
  table_->Insert(handle);
320
  num_entries_++;
321 322 323
  if (type == kTypeDeletion) {
    num_deletes_++;
  }
324

325 326 327 328 329
  if (prefix_bloom_) {
    assert(prefix_extractor_);
    prefix_bloom_->Add(prefix_extractor_->Transform(key));
  }

330 331 332 333
  // The first sequence number inserted into the memtable
  assert(first_seqno_ == 0 || s > first_seqno_);
  if (first_seqno_ == 0) {
    first_seqno_ = s;
A
agiardullo 已提交
334 335 336 337 338

    if (earliest_seqno_ == kMaxSequenceNumber) {
      earliest_seqno_ = first_seqno_;
    }
    assert(first_seqno_ >= earliest_seqno_);
339
  }
340 341

  should_flush_ = ShouldFlushNow();
J
jorlow@chromium.org 已提交
342 343
}

344 345 346 347 348 349 350 351 352
// Callback from MemTable::Get()
namespace {

struct Saver {
  Status* status;
  const LookupKey* key;
  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
  bool* merge_in_progress;
  std::string* value;
A
agiardullo 已提交
353
  SequenceNumber seq;
354 355 356 357 358 359 360
  const MergeOperator* merge_operator;
  // the merge operations encountered;
  MergeContext* merge_context;
  MemTable* mem;
  Logger* logger;
  Statistics* statistics;
  bool inplace_update_support;
361
  Env* env_;
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
};
}  // namespace

static bool SaveValue(void* arg, const char* entry) {
  Saver* s = reinterpret_cast<Saver*>(arg);
  MergeContext* merge_context = s->merge_context;
  const MergeOperator* merge_operator = s->merge_operator;

  assert(s != nullptr && merge_context != nullptr);

  // entry format is:
  //    klength  varint32
  //    userkey  char[klength-8]
  //    tag      uint64
  //    vlength  varint32
  //    value    char[vlength]
  // Check that it belongs to same user key.  We do not check the
  // sequence number since the Seek() call above should have skipped
  // all entries with overly large sequence numbers.
  uint32_t key_length;
  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
  if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
          Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
    // Correct user key
    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
A
agiardullo 已提交
387 388 389 390
    ValueType type;
    UnPackSequenceAndType(tag, &s->seq, &type);

    switch (type) {
391 392 393 394 395 396 397 398
      case kTypeValue: {
        if (s->inplace_update_support) {
          s->mem->GetLock(s->key->user_key())->ReadLock();
        }
        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
        *(s->status) = Status::OK();
        if (*(s->merge_in_progress)) {
          assert(merge_operator);
399 400 401 402 403 404 405 406 407 408 409
          bool merge_success = false;
          {
            StopWatchNano timer(s->env_, s->statistics != nullptr);
            PERF_TIMER_GUARD(merge_operator_time_nanos);
            merge_success = merge_operator->FullMerge(
                s->key->user_key(), &v, merge_context->GetOperands(), s->value,
                s->logger);
            RecordTick(s->statistics, MERGE_OPERATION_TOTAL_TIME,
                       timer.ElapsedNanos());
          }
          if (!merge_success) {
410 411 412 413 414 415 416 417
            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
            *(s->status) =
                Status::Corruption("Error: Could not perform merge.");
          }
        } else {
          s->value->assign(v.data(), v.size());
        }
        if (s->inplace_update_support) {
418
          s->mem->GetLock(s->key->user_key())->ReadUnlock();
419 420 421 422 423 424 425 426
        }
        *(s->found_final_value) = true;
        return false;
      }
      case kTypeDeletion: {
        if (*(s->merge_in_progress)) {
          assert(merge_operator);
          *(s->status) = Status::OK();
427 428 429 430 431 432 433 434 435 436 437
          bool merge_success = false;
          {
            StopWatchNano timer(s->env_, s->statistics != nullptr);
            PERF_TIMER_GUARD(merge_operator_time_nanos);
            merge_success = merge_operator->FullMerge(
                s->key->user_key(), nullptr, merge_context->GetOperands(),
                s->value, s->logger);
            RecordTick(s->statistics, MERGE_OPERATION_TOTAL_TIME,
                       timer.ElapsedNanos());
          }
          if (!merge_success) {
438 439 440 441 442 443 444 445 446 447 448
            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
            *(s->status) =
                Status::Corruption("Error: Could not perform merge.");
          }
        } else {
          *(s->status) = Status::NotFound();
        }
        *(s->found_final_value) = true;
        return false;
      }
      case kTypeMerge: {
449 450 451 452
        if (!merge_operator) {
          *(s->status) = Status::InvalidArgument(
              "merge_operator is not properly initialized.");
          // Normally we continue the loop (return true) when we see a merge
453
          // operand.  But in case of an error, we should stop the loop
454 455 456 457 458
          // immediately and pretend we have found the value to stop further
          // seek.  Otherwise, the later call will override this error status.
          *(s->found_final_value) = true;
          return false;
        }
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
        *(s->merge_in_progress) = true;
        merge_context->PushOperand(v);
        return true;
      }
      default:
        assert(false);
        return true;
    }
  }

  // s->state could be Corrupt, merge or notfound
  return false;
}

474
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
A
agiardullo 已提交
475
                   MergeContext* merge_context, SequenceNumber* seq) {
476
  // The sequence number is updated synchronously in version_set.h
477
  if (IsEmpty()) {
478 479 480
    // Avoiding recording stats for speed.
    return false;
  }
481
  PERF_TIMER_GUARD(get_from_memtable_time);
482

483
  Slice user_key = key.user_key();
484 485
  bool found_final_value = false;
  bool merge_in_progress = s->IsMergeInProgress();
486 487 488 489

  if (prefix_bloom_ &&
      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
    // iter is null if prefix bloom says the key does not exist
A
agiardullo 已提交
490
    *seq = kMaxSequenceNumber;
491
  } else {
492 493 494 495 496 497
    Saver saver;
    saver.status = s;
    saver.found_final_value = &found_final_value;
    saver.merge_in_progress = &merge_in_progress;
    saver.key = &key;
    saver.value = value;
A
agiardullo 已提交
498
    saver.seq = kMaxSequenceNumber;
499
    saver.mem = this;
500
    saver.merge_context = merge_context;
L
Lei Jin 已提交
501 502
    saver.merge_operator = moptions_.merge_operator;
    saver.logger = moptions_.info_log;
L
Lei Jin 已提交
503
    saver.inplace_update_support = moptions_.inplace_update_support;
L
Lei Jin 已提交
504
    saver.statistics = moptions_.statistics;
505
    saver.env_ = env_;
506
    table_->Get(key, &saver, SaveValue);
A
agiardullo 已提交
507 508

    *seq = saver.seq;
509
  }
510

511
  // No change to value, since we have not yet found a Put/Delete
512
  if (!found_final_value && merge_in_progress) {
513 514
    *s = Status::MergeInProgress("");
  }
L
Lei Jin 已提交
515
  PERF_COUNTER_ADD(get_from_memtable_count, 1);
516
  return found_final_value;
517 518
}

519
// Writes `value` for `key`, overwriting the latest entry in place when
// possible.
//
// The in-place path is taken only when the most recent entry for the user
// key is a kTypeValue whose stored value is at least as large as the new
// one. In every other case (no entry, a different entry type, or not enough
// room) a new entry is appended via Add().
void MemTable::Update(SequenceNumber seq,
                      const Slice& key,
                      const Slice& value) {
  LookupKey lkey(key, seq);
  Slice mem_key = lkey.memtable_key();

  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetDynamicPrefixIterator());
  iter->Seek(lkey.internal_key(), mem_key.data());

  if (iter->Valid()) {
    // entry format is:
    //    key_length  varint32
    //    userkey  char[klength-8]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter->key();
    uint32_t key_length = 0;
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
            Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      ValueType type;
      SequenceNumber unused;
      UnPackSequenceAndType(tag, &unused, &type);
      if (type == kTypeValue) {
        Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
        uint32_t new_size = static_cast<uint32_t>(value.size());

        // Update value, if new value size <= previous value size
        if (new_size <= prev_size) {
          // Take the write lock BEFORE touching the entry: the length prefix
          // and the value bytes must change together, otherwise a reader
          // holding the read lock in SaveValue() could decode the new length
          // against the old bytes.
          WriteLock wl(GetLock(lkey.user_key()));
          char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
                                   new_size);
          memcpy(p, value.data(), value.size());
          assert((unsigned)((p + value.size()) - entry) ==
                 (unsigned)(VarintLength(key_length) + key_length +
                            VarintLength(value.size()) + value.size()));
          return;
        }
      }
    }
  }

  // Either the key doesn't exist, its latest entry is not a kTypeValue, or
  // there is not enough space to update it in place.
  Add(seq, kTypeValue, key, value);
}

bool MemTable::UpdateCallback(SequenceNumber seq,
581
                              const Slice& key,
L
Lei Jin 已提交
582
                              const Slice& delta) {
583 584 585
  LookupKey lkey(key, seq);
  Slice memkey = lkey.memtable_key();

L
Lei Jin 已提交
586
  std::unique_ptr<MemTableRep::Iterator> iter(
587
      table_->GetDynamicPrefixIterator());
588
  iter->Seek(lkey.internal_key(), memkey.data());
589 590 591 592 593 594 595 596 597 598 599 600

  if (iter->Valid()) {
    // entry format is:
    //    key_length  varint32
    //    userkey  char[klength-8]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter->key();
K
Kai Liu 已提交
601
    uint32_t key_length = 0;
602 603 604 605 606
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
A
agiardullo 已提交
607 608 609 610
      ValueType type;
      uint64_t unused;
      UnPackSequenceAndType(tag, &unused, &type);
      switch (type) {
611 612
        case kTypeValue: {
          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
613
          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
614 615

          char* prev_buffer = const_cast<char*>(prev_value.data());
616
          uint32_t new_prev_size = prev_size;
617 618

          std::string str_value;
619
          WriteLock wl(GetLock(lkey.user_key()));
L
Lei Jin 已提交
620 621
          auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
                                                   delta, &str_value);
622
          if (status == UpdateStatus::UPDATED_INPLACE) {
623
            // Value already updated by callback.
624 625 626 627 628 629 630 631 632 633
            assert(new_prev_size <= prev_size);
            if (new_prev_size < prev_size) {
              // overwrite the new prev_size
              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
                                       new_prev_size);
              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
                // shift the value buffer as well.
                memcpy(p, prev_buffer, new_prev_size);
              }
            }
L
Lei Jin 已提交
634
            RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
635
            should_flush_ = ShouldFlushNow();
636
            return true;
637 638
          } else if (status == UpdateStatus::UPDATED) {
            Add(seq, kTypeValue, key, Slice(str_value));
L
Lei Jin 已提交
639
            RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
640
            should_flush_ = ShouldFlushNow();
641
            return true;
642 643
          } else if (status == UpdateStatus::UPDATE_FAILED) {
            // No action required. Return.
644
            should_flush_ = ShouldFlushNow();
645 646 647 648 649 650 651 652 653 654
            return true;
          }
        }
        default:
          break;
      }
    }
  }
  // If the latest value is not kTypeValue
  // or key doesn't exist
655 656
  return false;
}
657 658 659 660 661 662 663

size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
  Slice memkey = key.memtable_key();

  // A total ordered iterator is costly for some memtablerep (prefix aware
  // reps). By passing in the user key, we allow efficient iterator creation.
  // The iterator only needs to be ordered within the same user key.
664
  std::unique_ptr<MemTableRep::Iterator> iter(
665
      table_->GetDynamicPrefixIterator());
666
  iter->Seek(key.internal_key(), memkey.data());
667 668 669 670 671

  size_t num_successive_merges = 0;

  for (; iter->Valid(); iter->Next()) {
    const char* entry = iter->key();
K
Kai Liu 已提交
672
    uint32_t key_length = 0;
673
    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
I
Igor Canadi 已提交
674 675
    if (comparator_.comparator.user_comparator()->Compare(
            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
676 677 678 679
      break;
    }

    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
A
agiardullo 已提交
680 681 682 683
    ValueType type;
    uint64_t unused;
    UnPackSequenceAndType(tag, &unused, &type);
    if (type != kTypeMerge) {
684 685 686 687 688 689 690 691 692
      break;
    }

    ++num_successive_merges;
  }

  return num_successive_merges;
}

693 694
void MemTableRep::Get(const LookupKey& k, void* callback_args,
                      bool (*callback_func)(void* arg, const char* entry)) {
695
  auto iter = GetDynamicPrefixIterator();
696 697 698 699 700 701
  for (iter->Seek(k.internal_key(), k.memtable_key().data());
       iter->Valid() && callback_func(callback_args, iter->key());
       iter->Next()) {
  }
}

702
}  // namespace rocksdb