memtable_list.h 18.5 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
#pragma once
K
Kai Liu 已提交
7

8 9
#include <deque>
#include <limits>
10
#include <list>
11
#include <set>
12 13
#include <string>
#include <vector>
K
Kai Liu 已提交
14

S
Siying Dong 已提交
15
#include "db/logs_with_prep_tracker.h"
16
#include "db/memtable.h"
17
#include "db/range_del_aggregator.h"
18
#include "file/filename.h"
19
#include "logging/log_buffer.h"
20
#include "monitoring/instrumented_mutex.h"
K
kailiu 已提交
21 22 23
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
A
agiardullo 已提交
24
#include "rocksdb/types.h"
K
Kai Liu 已提交
25
#include "util/autovector.h"
26

27
namespace ROCKSDB_NAMESPACE {
28

29
class ColumnFamilyData;
30
class InternalKeyComparator;
31
class InstrumentedMutex;
32
class MergeIteratorBuilder;
33
class MemTableList;
34

35 36
struct FlushJobInfo;

I
Igor Canadi 已提交
37 38 39
// keeps a list of immutable memtables in a vector. the list is immutable
// if refcount is bigger than one. It is used as a state for Get() and
// Iterator code paths
40 41 42
//
// This class is not thread-safe.  External synchronization is required
// (such as holding the db mutex or being on the write thread).
I
Igor Canadi 已提交
43 44
class MemTableListVersion {
 public:
45
  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
46
                               const MemTableListVersion& old);
47
  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
48 49
                               int max_write_buffer_number_to_maintain,
                               int64_t max_write_buffer_size_to_maintain);
I
Igor Canadi 已提交
50 51

  void Ref();
K
kailiu 已提交
52
  void Unref(autovector<MemTable*>* to_delete = nullptr);
I
Igor Canadi 已提交
53 54 55

  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
A
agiardullo 已提交
56 57 58 59
  //
  // If any operation was found for this key, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
H
Huisheng Liu 已提交
60 61
  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
           Status* s, MergeContext* merge_context,
62 63 64
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr);
M
Maysam Yabandeh 已提交
65

H
Huisheng Liu 已提交
66 67
  bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
           Status* s, MergeContext* merge_context,
68
           SequenceNumber* max_covering_tombstone_seq,
Y
Yi Wu 已提交
69 70
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr) {
M
Maysam Yabandeh 已提交
71
    SequenceNumber seq;
H
Huisheng Liu 已提交
72 73 74
    return Get(key, value, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, callback,
               is_blob_index);
M
Maysam Yabandeh 已提交
75
  }
M
Maysam Yabandeh 已提交
76

77
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
78
                ReadCallback* callback);
79

80 81 82 83 84 85 86
  // Returns all the merge operands corresponding to the key by searching all
  // memtables starting from the most recent one.
  bool GetMergeOperands(const LookupKey& key, Status* s,
                        MergeContext* merge_context,
                        SequenceNumber* max_covering_tombstone_seq,
                        const ReadOptions& read_opts);

87 88 89 90
  // Similar to Get(), but searches the Memtable history of memtables that
  // have already been flushed.  Should only be used from in-memory only
  // queries (such as Transaction validation) as the history may contain
  // writes that are also present in the SST files.
H
Huisheng Liu 已提交
91 92
  bool GetFromHistory(const LookupKey& key, std::string* value,
                      std::string* timestamp, Status* s,
A
Andrew Kryczka 已提交
93
                      MergeContext* merge_context,
94 95
                      SequenceNumber* max_covering_tombstone_seq,
                      SequenceNumber* seq, const ReadOptions& read_opts,
96
                      bool* is_blob_index = nullptr);
H
Huisheng Liu 已提交
97 98
  bool GetFromHistory(const LookupKey& key, std::string* value,
                      std::string* timestamp, Status* s,
M
Maysam Yabandeh 已提交
99
                      MergeContext* merge_context,
100
                      SequenceNumber* max_covering_tombstone_seq,
101 102
                      const ReadOptions& read_opts,
                      bool* is_blob_index = nullptr) {
A
agiardullo 已提交
103
    SequenceNumber seq;
H
Huisheng Liu 已提交
104
    return GetFromHistory(key, value, timestamp, s, merge_context,
105 106
                          max_covering_tombstone_seq, &seq, read_opts,
                          is_blob_index);
A
agiardullo 已提交
107
  }
108

A
Andrew Kryczka 已提交
109
  Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
110
                                    RangeDelAggregator* range_del_agg);
A
Andrew Kryczka 已提交
111

I
Igor Canadi 已提交
112
  void AddIterators(const ReadOptions& options,
S
sdong 已提交
113 114
                    std::vector<InternalIterator*>* iterator_list,
                    Arena* arena);
I
Igor Canadi 已提交
115

116 117 118
  void AddIterators(const ReadOptions& options,
                    MergeIteratorBuilder* merge_iter_builder);

119 120
  uint64_t GetTotalNumEntries() const;

121 122
  uint64_t GetTotalNumDeletes() const;

123 124
  MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
                                           const Slice& end_ikey);
125

A
agiardullo 已提交
126 127 128 129 130 131
  // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
  // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
  // If include_history=true, will also search Memtables in MemTableList
  // History.
  SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;

I
Igor Canadi 已提交
132
 private:
133 134 135 136 137 138 139
  friend class MemTableList;

  friend Status InstallMemtableAtomicFlushResults(
      const autovector<MemTableList*>* imm_lists,
      const autovector<ColumnFamilyData*>& cfds,
      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
      const autovector<const autovector<MemTable*>*>& mems_list,
140 141
      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
142 143
      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
          committed_flush_jobs_info,
144
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
145 146
      LogBuffer* log_buffer);

147 148 149 150 151
  // REQUIRE: m is an immutable memtable
  void Add(MemTable* m, autovector<MemTable*>* to_delete);
  // REQUIRE: m is an immutable memtable
  void Remove(MemTable* m, autovector<MemTable*>* to_delete);

152 153
  // Return true if memtable is trimmed
  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
I
Igor Canadi 已提交
154

A
agiardullo 已提交
155
  bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
H
Huisheng Liu 已提交
156 157
                   std::string* value, std::string* timestamp, Status* s,
                   MergeContext* merge_context,
158 159
                   SequenceNumber* max_covering_tombstone_seq,
                   SequenceNumber* seq, const ReadOptions& read_opts,
Y
Yi Wu 已提交
160 161
                   ReadCallback* callback = nullptr,
                   bool* is_blob_index = nullptr);
A
agiardullo 已提交
162

163 164 165 166
  void AddMemTable(MemTable* m);

  void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);

167 168 169 170
  // Calculate the total amount of memory used by memlist_ and memlist_history_
  // excluding the last MemTable in memlist_history_. The reason for excluding
  // the last MemTable is to see if dropping the last MemTable will keep total
  // memory usage above or equal to max_write_buffer_size_to_maintain_
171
  size_t MemoryAllocatedBytesExcludingLast() const;
172

173 174 175 176
  // Whether this version contains flushed memtables that are only kept around
  // for transaction conflict checking.
  bool HasHistory() const { return !memlist_history_.empty(); }

177 178
  bool MemtableLimitExceeded(size_t usage);

179
  // Immutable MemTables that have not yet been flushed.
I
Igor Canadi 已提交
180
  std::list<MemTable*> memlist_;
181 182 183 184 185 186 187

  // MemTables that have already been flushed
  // (used during Transaction validation)
  std::list<MemTable*> memlist_history_;

  // Maximum number of MemTables to keep in memory (including both flushed
  const int max_write_buffer_number_to_maintain_;
188 189 190
  // Maximum size of MemTables to keep in memory (including both flushed
  // and not-yet-flushed tables).
  const int64_t max_write_buffer_size_to_maintain_;
191

I
Igor Canadi 已提交
192
  int refs_ = 0;
193 194

  size_t* parent_memtable_list_memory_usage_;
I
Igor Canadi 已提交
195 196
};

197
// This class stores references to all the immutable memtables.
A
Abhishek Kona 已提交
198
// The memtables are flushed to L0 as soon as possible and in
199
// any order. If there are more than one immutable memtable, their
A
Abhishek Kona 已提交
200
// flushes can occur concurrently.  However, they are 'committed'
201 202
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
203 204
//
//
205 206 207
// Other than imm_flush_needed and imm_trim_needed, this class is not
// thread-safe and requires external synchronization (such as holding the db
// mutex or being on the write thread.)
208 209
class MemTableList {
 public:
A
Abhishek Kona 已提交
210
  // A list of memtables.
211
  explicit MemTableList(int min_write_buffer_number_to_merge,
212 213
                        int max_write_buffer_number_to_maintain,
                        int64_t max_write_buffer_size_to_maintain)
I
Igor Canadi 已提交
214
      : imm_flush_needed(false),
215
        imm_trim_needed(false),
I
Igor Canadi 已提交
216
        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
217
        current_(new MemTableListVersion(&current_memory_usage_,
218 219
                                         max_write_buffer_number_to_maintain,
                                         max_write_buffer_size_to_maintain)),
I
Igor Canadi 已提交
220 221
        num_flush_not_started_(0),
        commit_in_progress_(false),
222 223
        flush_requested_(false),
        current_memory_usage_(0),
224
        current_memory_allocted_bytes_excluding_last_(0),
225
        current_has_history_(false) {
I
Igor Canadi 已提交
226
    current_->Ref();
227
  }
228 229 230

  // Should not delete MemTableList without making sure MemTableList::current()
  // is Unref()'d.
I
Igor Canadi 已提交
231 232
  ~MemTableList() {}

233
  MemTableListVersion* current() const { return current_; }
234

L
Lei Jin 已提交
235 236
  // so that background threads can detect non-nullptr pointer to
  // determine whether there is anything more to start flushing.
I
Igor Canadi 已提交
237
  std::atomic<bool> imm_flush_needed;
238

239 240
  std::atomic<bool> imm_trim_needed;

241 242 243 244 245 246 247
  // Returns the total number of memtables in the list that haven't yet
  // been flushed and logged.
  int NumNotFlushed() const;

  // Returns total number of memtables in the list that have been
  // completely flushed and logged.
  int NumFlushed() const;
248 249 250

  // Returns true if there is at least one memtable on which flush has
  // not yet started.
251
  bool IsFlushPending() const;
252

253 254
  // Returns the earliest memtables that needs to be flushed. The returned
  // memtables are guaranteed to be in the ascending order of created time.
255
  void PickMemtablesToFlush(uint64_t max_memtable_id,
256 257
                            autovector<MemTable*>* mems,
                            uint64_t* max_next_log_number = nullptr);
258

L
Lei Jin 已提交
259 260 261
  // Reset status of the given memtable list back to pending state so that
  // they can get picked up again on the next round of flush.
  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
I
Igor Canadi 已提交
262
                             uint64_t file_number);
L
Lei Jin 已提交
263

264 265 266
  // Try commit a successful flush in the manifest file. It might just return
  // Status::OK letting a concurrent flush to do the actual the recording.
  Status TryInstallMemtableFlushResults(
267
      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
S
Siying Dong 已提交
268 269
      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
270
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
271
      LogBuffer* log_buffer,
272
      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
273
      bool write_edits = true);
274

A
Abhishek Kona 已提交
275
  // New memtables are inserted at the front of the list.
276
  // Takes ownership of the referenced held on *m by the caller of Add().
277 278 279
  // By default, adding memtables will flag that the memtable list needs to be
  // flushed, but in certain situations, like after a mempurge, we may want to
  // avoid flushing the memtable list upon addition of a memtable.
280
  void Add(MemTable* m, autovector<MemTable*>* to_delete);
281 282 283 284

  // Returns an estimate of the number of bytes of data in use.
  size_t ApproximateMemoryUsage();

285 286
  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
  size_t MemoryAllocatedBytesExcludingLast() const;
287

288 289 290
  // Returns the cached current_has_history_ value.
  bool HasHistory() const;

291 292 293
  // Updates current_memory_allocted_bytes_excluding_last_ and
  // current_has_history_ from MemTableListVersion. Must be called whenever
  // InstallNewVersion is called.
294
  void UpdateCachedValuesFromMemTableListVersion();
295 296 297 298 299

  // `usage` is the current size of the mutable Memtable. When
  // max_write_buffer_size_to_maintain is used, total size of mutable and
  // immutable memtables is checked against it to decide whether to trim
  // memtable list.
300 301 302
  //
  // Return true if memtable is trimmed
  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
303

304 305 306 307
  // Returns an estimate of the number of bytes of data used by
  // the unflushed mem-tables.
  size_t ApproximateUnflushedMemTablesMemoryUsage();

Y
Yi Wu 已提交
308 309 310
  // Returns an estimate of the timestamp of the earliest key.
  uint64_t ApproximateOldestKeyTime() const;

A
agiardullo 已提交
311 312 313 314 315
  // Request a flush of all existing memtables to storage.  This will
  // cause future calls to IsFlushPending() to return true if this list is
  // non-empty (regardless of the min_write_buffer_number_to_merge
  // parameter). This flush request will persist until the next time
  // PickMemtablesToFlush() is called.
316 317
  void FlushRequested() {
    flush_requested_ = true;
318
    // If there are some memtables stored in imm() that don't trigger
319 320 321 322 323 324 325 326 327
    // flush (eg: mempurge output memtable), then update imm_flush_needed.
    // Note: if race condition and imm_flush_needed is set to true
    // when there is num_flush_not_started_==0, then there is no
    // impact whatsoever. Imm_flush_needed is only used in an assert
    // in IsFlushPending().
    if (num_flush_not_started_ > 0) {
      imm_flush_needed.store(true, std::memory_order_release);
    }
  }
328

329 330
  bool HasFlushRequested() { return flush_requested_; }

331 332 333 334 335 336 337 338 339 340 341 342 343 344
  // Returns true if a trim history should be scheduled and the caller should
  // be the one to schedule it
  bool MarkTrimHistoryNeeded() {
    auto expected = false;
    return imm_trim_needed.compare_exchange_strong(
        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
  }

  void ResetTrimHistoryNeeded() {
    auto expected = true;
    imm_trim_needed.compare_exchange_strong(
        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
  }

345 346 347 348
  // Copying allowed
  // MemTableList(const MemTableList&);
  // void operator=(const MemTableList&);

349 350
  size_t* current_memory_usage() { return &current_memory_usage_; }

S
Siying Dong 已提交
351 352 353
  // Returns the min log containing the prep section after memtables listsed in
  // `memtables_to_flush` are flushed and their status is persisted in manifest.
  uint64_t PrecomputeMinLogContainingPrepSection(
354
      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
355

356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
  uint64_t GetEarliestMemTableID() const {
    auto& memlist = current_->memlist_;
    if (memlist.empty()) {
      return std::numeric_limits<uint64_t>::max();
    }
    return memlist.back()->GetID();
  }

  uint64_t GetLatestMemTableID() const {
    auto& memlist = current_->memlist_;
    if (memlist.empty()) {
      return 0;
    }
    return memlist.front()->GetID();
  }

Y
Yanqin Jin 已提交
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
    const auto& memlist = current_->memlist_;
    // Scan the memtable list from new to old
    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
      MemTable* mem = *it;
      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
        mem->atomic_flush_seqno_ = seq;
      } else {
        // Earlier memtables must have been assigned a atomic flush seq, no
        // need to continue scan.
        break;
      }
    }
  }

387 388 389 390 391 392 393
  // Used only by DBImplSecondary during log replay.
  // Remove memtables whose data were written before the WAL with log_number
  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
  // not freed, but put into a vector for future deref and reclamation.
  void RemoveOldMemTables(uint64_t log_number,
                          autovector<MemTable*>* to_delete);

394
 private:
395 396 397 398 399
  friend Status InstallMemtableAtomicFlushResults(
      const autovector<MemTableList*>* imm_lists,
      const autovector<ColumnFamilyData*>& cfds,
      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
      const autovector<const autovector<MemTable*>*>& mems_list,
400 401
      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
402 403
      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
          committed_flush_jobs_info,
404
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
405 406
      LogBuffer* log_buffer);

I
Igor Canadi 已提交
407 408 409
  // DB mutex held
  void InstallNewVersion();

410 411 412 413 414 415 416
  // DB mutex held
  // Called after writing to MANIFEST
  void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
                                     size_t batch_count, LogBuffer* log_buffer,
                                     autovector<MemTable*>* to_delete,
                                     InstrumentedMutex* mu);

417
  const int min_write_buffer_number_to_merge_;
I
Igor Canadi 已提交
418 419

  MemTableListVersion* current_;
420 421 422 423 424 425 426

  // the number of elements that still need flushing
  int num_flush_not_started_;

  // committing in progress
  bool commit_in_progress_;

427 428
  // Requested a flush of memtables to storage. It's possible to request that
  // a subset of memtables be flushed.
429 430
  bool flush_requested_;

431 432
  // The current memory usage.
  size_t current_memory_usage_;
433

434 435
  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
436 437 438

  // Cached value of current_->HasHistory().
  std::atomic<bool> current_has_history_;
439 440
};

441 442 443 444 445 446 447 448 449 450 451
// Installs memtable atomic flush results.
// In most cases, imm_lists is nullptr, and the function simply uses the
// immutable memtable lists associated with the cfds. There are unit tests that
// installs flush results for external immutable memtable lists other than the
// cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case,
// imm_lists parameter is not nullptr.
extern Status InstallMemtableAtomicFlushResults(
    const autovector<MemTableList*>* imm_lists,
    const autovector<ColumnFamilyData*>& cfds,
    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
    const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
452 453
    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
    const autovector<FileMetaData*>& file_meta,
454 455
    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
        committed_flush_jobs_info,
456
    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
457
    LogBuffer* log_buffer);
458
}  // namespace ROCKSDB_NAMESPACE