db_impl.h 42.4 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 3 4 5
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#pragma once
K
Kai Liu 已提交
10

H
Haobo Xu 已提交
11
#include <atomic>
12
#include <deque>
13
#include <functional>
14
#include <limits>
I
Igor Canadi 已提交
15
#include <list>
16
#include <queue>
17
#include <set>
I
Igor Canadi 已提交
18
#include <string>
19 20
#include <utility>
#include <vector>
K
kailiu 已提交
21

22
#include "db/column_family.h"
23
#include "db/compaction_job.h"
24
#include "db/dbformat.h"
25
#include "db/flush_job.h"
26 27
#include "db/flush_scheduler.h"
#include "db/internal_stats.h"
A
agiardullo 已提交
28 29
#include "db/log_writer.h"
#include "db/snapshot_impl.h"
30
#include "db/version_edit.h"
I
Igor Canadi 已提交
31
#include "db/wal_manager.h"
32 33
#include "db/write_controller.h"
#include "db/write_thread.h"
34
#include "db/writebuffer.h"
K
Kai Liu 已提交
35 36
#include "memtable_list.h"
#include "port/port.h"
37 38 39 40
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
S
sdong 已提交
41
#include "table/scoped_arena_iterator.h"
42
#include "util/autovector.h"
I
Igor Canadi 已提交
43 44
#include "util/event_logger.h"
#include "util/hash.h"
45
#include "util/instrumented_mutex.h"
46 47
#include "util/stop_watch.h"
#include "util/thread_local.h"
48

49
namespace rocksdb {
J
jorlow@chromium.org 已提交
50 51 52 53 54 55

class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
56
class Arena;
A
agiardullo 已提交
57
class WriteCallback;
I
Igor Canadi 已提交
58
struct JobContext;
59
struct ExternalSstFileInfo;
60
struct MemTableInfo;
J
jorlow@chromium.org 已提交
61 62 63

class DBImpl : public DB {
 public:
I
Igor Canadi 已提交
64
  DBImpl(const DBOptions& options, const std::string& dbname);
J
jorlow@chromium.org 已提交
65 66 67
  virtual ~DBImpl();

  // Implementations of the DB interface
68 69
  using DB::Put;
  virtual Status Put(const WriteOptions& options,
70
                     ColumnFamilyHandle* column_family, const Slice& key,
I
Igor Sugak 已提交
71
                     const Slice& value) override;
72 73
  using DB::Merge;
  virtual Status Merge(const WriteOptions& options,
74
                       ColumnFamilyHandle* column_family, const Slice& key,
I
Igor Sugak 已提交
75
                       const Slice& value) override;
76 77
  using DB::Delete;
  virtual Status Delete(const WriteOptions& options,
I
Igor Sugak 已提交
78 79
                        ColumnFamilyHandle* column_family,
                        const Slice& key) override;
A
Andres Noetzli 已提交
80 81 82 83
  using DB::SingleDelete;
  virtual Status SingleDelete(const WriteOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key) override;
84
  using DB::Write;
I
Igor Sugak 已提交
85 86
  virtual Status Write(const WriteOptions& options,
                       WriteBatch* updates) override;
A
agiardullo 已提交
87

88
  using DB::Get;
J
jorlow@chromium.org 已提交
89
  virtual Status Get(const ReadOptions& options,
90
                     ColumnFamilyHandle* column_family, const Slice& key,
I
Igor Sugak 已提交
91
                     std::string* value) override;
92 93 94
  using DB::MultiGet;
  virtual std::vector<Status> MultiGet(
      const ReadOptions& options,
95
      const std::vector<ColumnFamilyHandle*>& column_family,
I
Igor Sugak 已提交
96 97
      const std::vector<Slice>& keys,
      std::vector<std::string>* values) override;
98

99
  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
100
                                    const std::string& column_family,
I
Igor Sugak 已提交
101 102
                                    ColumnFamilyHandle** handle) override;
  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
103

104 105 106 107
  // Returns false if key doesn't exist in the database and true if it may.
  // If value_found is not passed in as null, then return the value if found in
  // memory. On return, *value_found will be set to true if the value was
  // found, and to false otherwise.
108
  using DB::KeyMayExist;
109
  virtual bool KeyMayExist(const ReadOptions& options,
110
                           ColumnFamilyHandle* column_family, const Slice& key,
I
Igor Sugak 已提交
111 112
                           std::string* value,
                           bool* value_found = nullptr) override;
113 114
  using DB::NewIterator;
  virtual Iterator* NewIterator(const ReadOptions& options,
I
Igor Sugak 已提交
115
                                ColumnFamilyHandle* column_family) override;
116 117
  virtual Status NewIterators(
      const ReadOptions& options,
I
Igor Canadi 已提交
118
      const std::vector<ColumnFamilyHandle*>& column_families,
I
Igor Sugak 已提交
119 120 121
      std::vector<Iterator*>* iterators) override;
  virtual const Snapshot* GetSnapshot() override;
  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
122
  using DB::GetProperty;
123
  virtual bool GetProperty(ColumnFamilyHandle* column_family,
I
Igor Sugak 已提交
124
                           const Slice& property, std::string* value) override;
125 126 127
  using DB::GetIntProperty;
  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
                              const Slice& property, uint64_t* value) override;
128 129 130
  using DB::GetAggregatedIntProperty;
  virtual bool GetAggregatedIntProperty(const Slice& property,
                                        uint64_t* aggregated_value) override;
131
  using DB::GetApproximateSizes;
132
  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
133 134
                                   const Range* range, int n, uint64_t* sizes,
                                   bool include_memtable = false) override;
135
  using DB::CompactRange;
136 137 138
  virtual Status CompactRange(const CompactRangeOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice* begin, const Slice* end) override;
139

140
  using DB::CompactFiles;
I
Igor Sugak 已提交
141 142 143 144 145
  virtual Status CompactFiles(const CompactionOptions& compact_options,
                              ColumnFamilyHandle* column_family,
                              const std::vector<std::string>& input_file_names,
                              const int output_level,
                              const int output_path_id = -1) override;
146

147 148 149
  virtual Status PauseBackgroundWork() override;
  virtual Status ContinueBackgroundWork() override;

150 151 152
  virtual Status EnableAutoCompaction(
      const std::vector<ColumnFamilyHandle*>& column_family_handles) override;

153
  using DB::SetOptions;
I
Igor Sugak 已提交
154 155 156
  Status SetOptions(
      ColumnFamilyHandle* column_family,
      const std::unordered_map<std::string, std::string>& options_map) override;
157

158
  using DB::NumberLevels;
I
Igor Sugak 已提交
159
  virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
160
  using DB::MaxMemCompactionLevel;
I
Igor Sugak 已提交
161
  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
162
  using DB::Level0StopWriteTrigger;
I
Igor Sugak 已提交
163 164 165 166
  virtual int Level0StopWriteTrigger(
      ColumnFamilyHandle* column_family) override;
  virtual const std::string& GetName() const override;
  virtual Env* GetEnv() const override;
167
  using DB::GetOptions;
I
Igor Sugak 已提交
168 169
  virtual const Options& GetOptions(
      ColumnFamilyHandle* column_family) const override;
170 171
  using DB::GetDBOptions;
  virtual const DBOptions& GetDBOptions() const override;
172 173
  using DB::Flush;
  virtual Status Flush(const FlushOptions& options,
I
Igor Sugak 已提交
174
                       ColumnFamilyHandle* column_family) override;
175
  virtual Status SyncWAL() override;
I
Igor Canadi 已提交
176

I
Igor Sugak 已提交
177
  virtual SequenceNumber GetLatestSequenceNumber() const override;
I
Igor Canadi 已提交
178 179

#ifndef ROCKSDB_LITE
I
Igor Sugak 已提交
180 181
  virtual Status DisableFileDeletions() override;
  virtual Status EnableFileDeletions(bool force) override;
182
  virtual int IsFileDeletionsEnabled() const;
I
Igor Canadi 已提交
183
  // All the returned filenames start with "/"
184
  virtual Status GetLiveFiles(std::vector<std::string>&,
185
                              uint64_t* manifest_file_size,
I
Igor Sugak 已提交
186 187
                              bool flush_memtable = true) override;
  virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
I
Igor Canadi 已提交
188

189 190 191
  virtual Status GetUpdatesSince(
      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
      const TransactionLogIterator::ReadOptions&
I
Igor Sugak 已提交
192 193
          read_options = TransactionLogIterator::ReadOptions()) override;
  virtual Status DeleteFile(std::string name) override;
194 195
  Status DeleteFilesInRange(ColumnFamilyHandle* column_family,
                            const Slice* begin, const Slice* end);
196

I
Igor Sugak 已提交
197 198
  virtual void GetLiveFilesMetaData(
      std::vector<LiveFileMetaData>* metadata) override;
199 200 201 202 203 204 205 206 207

  // Obtains the meta data of the specified column family of the DB.
  // Status::NotFound() will be returned if the current DB does not have
  // any column family match the specified name.
  // TODO(yhchiang): output parameter is placed in the end in this codebase.
  virtual void GetColumnFamilyMetaData(
      ColumnFamilyHandle* column_family,
      ColumnFamilyMetaData* metadata) override;

208 209 210 211
  // experimental API
  Status SuggestCompactRange(ColumnFamilyHandle* column_family,
                             const Slice* begin, const Slice* end);

212 213
  Status PromoteL0(ColumnFamilyHandle* column_family, int target_level);

A
agiardullo 已提交
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
  // Similar to Write() but will call the callback once on the single write
  // thread to determine whether it is safe to perform the write.
  virtual Status WriteWithCallback(const WriteOptions& write_options,
                                   WriteBatch* my_batch,
                                   WriteCallback* callback);

  // Returns the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into the current
  // memtables. It can then be assumed that any write with a larger(or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  //
  // If include_history=true, will also search Memtables in MemTableList
  // History.
  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
                                                   bool include_history);

  // For a given key, check to see if there are any records for this key
234 235 236 237
  // in the memtables, including memtable history.  If cache_only is false,
  // SST files will also be checked.
  //
  // If a key is found, *found_record_for_key will be set to true and
238
  // *seq will be set to the stored sequence number for the latest
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
  // operation on this key or kMaxSequenceNumber if unknown.
  // If no key is found, *found_record_for_key will be set to false.
  //
  // Note: If cache_only=false, it is possible for *seq to be set to 0 if
  // the sequence number has been cleared from the record.  If the caller is
  // holding an active db snapshot, we know the missing sequence must be less
  // than the snapshot's sequence number (sequence numbers are only cleared
  // when there are no earlier active snapshots).
  //
  // If NotFound is returned and found_record_for_key is set to false, then no
  // record for this key was found.  If the caller is holding an active db
  // snapshot, we know that no key could have existed after this snapshot
  // (since we do not compact keys that have an earlier snapshot).
  //
  // Returns OK or NotFound on success,
  // other status on unexpected error.
  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
                                 bool cache_only, SequenceNumber* seq,
                                 bool* found_record_for_key);
A
agiardullo 已提交
258

259 260 261 262 263 264 265
  using DB::AddFile;
  virtual Status AddFile(ColumnFamilyHandle* column_family,
                         const ExternalSstFileInfo* file_info,
                         bool move_file) override;
  virtual Status AddFile(ColumnFamilyHandle* column_family,
                         const std::string& file_path, bool move_file) override;

I
Igor Canadi 已提交
266
#endif  // ROCKSDB_LITE
267

268 269 270 271 272 273
  // Similar to GetSnapshot(), but also lets the db know that this snapshot
  // will be used for transaction write-conflict checking.  The DB can then
  // make sure not to compact any keys that would prevent a write-conflict from
  // being detected.
  const Snapshot* GetSnapshotForWriteConflictBoundary();

I
Igor Canadi 已提交
274 275 276 277
  // checks if all live files exist on file system and that their file sizes
  // match to our in-memory records
  virtual Status CheckConsistency();

278
  virtual Status GetDbIdentity(std::string& identity) const override;
279

I
Igor Canadi 已提交
280
  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
281
                             int output_level, uint32_t output_path_id,
282
                             const Slice* begin, const Slice* end,
283
                             bool exclusive,
284
                             bool disallow_trivial_move = false);
285

286 287 288
  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
S
sdong 已提交
289 290
  InternalIterator* NewInternalIterator(
      Arena* arena, ColumnFamilyHandle* column_family = nullptr);
291

292
#ifndef NDEBUG
J
jorlow@chromium.org 已提交
293
  // Extra methods (for testing) that are not in the public DB interface
I
Igor Canadi 已提交
294
  // Implemented in db_impl_debug.cc
J
jorlow@chromium.org 已提交
295

296
  // Compact any files in the named level that overlap [*begin, *end]
297
  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
298 299
                           ColumnFamilyHandle* column_family = nullptr,
                           bool disallow_trivial_move = false);
J
jorlow@chromium.org 已提交
300

301
  // Force current memtable contents to be flushed.
302 303
  Status TEST_FlushMemTable(bool wait = true,
                            ColumnFamilyHandle* cfh = nullptr);
J
jorlow@chromium.org 已提交
304

305
  // Wait for memtable compaction
306
  Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
307 308 309 310

  // Wait for any compaction
  Status TEST_WaitForCompact();

311 312
  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
313 314
  int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
                                                nullptr);
315

A
Abhishek Kona 已提交
316 317
  // Return the current manifest file no.
  uint64_t TEST_Current_Manifest_FileNo();
318

319
  // get total level0 file size. Only for testing.
320
  uint64_t TEST_GetLevel0TotalSize();
321

322 323
  void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
                             std::vector<std::vector<FileMetaData>>* metadata);
324

325 326 327 328 329 330 331 332 333 334
  void TEST_LockMutex();

  void TEST_UnlockMutex();

  // REQUIRES: mutex locked
  void* TEST_BeginWrite();

  // REQUIRES: mutex locked
  // pass the pointer that you got from TEST_BeginWrite()
  void TEST_EndWrite(void* w);
335

336
  uint64_t TEST_MaxTotalInMemoryState() const {
337 338
    return max_total_in_memory_state_;
  }
339

340 341
  size_t TEST_LogsToFreeSize();

342 343
  uint64_t TEST_LogfileNumber();

344 345 346 347
  // Returns column family name to ImmutableCFOptions map.
  Status TEST_GetAllImmutableCFOptions(
      std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);

A
Aaron Gao 已提交
348 349 350 351
  // Return the latest MutableCFOptions of a column family
  Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
                                        MutableCFOptions* mutable_cf_opitons);

352 353
  // Test-only accessor: exposes the raw pointer held by table_cache_.
  // The returned pointer is non-owning.
  Cache* TEST_table_cache() {
    return table_cache_.get();
  }

354
  // Test-only accessor: returns a mutable reference to write_controller_.
  WriteController& TEST_write_controler() {
    return write_controller_;
  }
355

356 357 358
  uint64_t TEST_FindMinLogContainingOutstandingPrep();
  uint64_t TEST_FindMinPrepLogReferencedByMemTable();

359
#endif  // NDEBUG
I
Igor Canadi 已提交
360

361 362 363 364
  // Return maximum background compaction allowed to be scheduled based on
  // compaction status.
  int BGCompactionsAllowed() const;

I
Igor Canadi 已提交
365
  // Returns the list of live files in 'live' and the list
K
kailiu 已提交
366
  // of all files in the filesystem in 'candidate_files'.
I
Igor Canadi 已提交
367
  // If force == false and the last call was less than
368
  // db_options_.delete_obsolete_files_period_micros microseconds ago,
I
Igor Canadi 已提交
369 370
  // it will not fill up the job_context
  void FindObsoleteFiles(JobContext* job_context, bool force,
I
Igor Canadi 已提交
371 372 373 374 375 376
                         bool no_full_scan = false);

  // Diffs the files listed in filenames and those that do not
  // belong to live files are possibly removed. Also, removes all the
  // files in sst_delete_files and log_delete_files.
  // It is not necessary to hold the mutex when invoking this method.
377 378 379 380
  void PurgeObsoleteFiles(const JobContext& background_contet,
                          bool schedule_only = false);

  void SchedulePurge();
I
Igor Canadi 已提交
381

I
Igor Sugak 已提交
382
  ColumnFamilyHandle* DefaultColumnFamily() const override;
383

384 385
  // Read-only view of the snapshot list (snapshots_) kept by this DB.
  const SnapshotList& snapshots() const {
    return snapshots_;
  }

386
  void CancelAllBackgroundWork(bool wait);
387

A
agiardullo 已提交
388 389 390 391 392 393 394 395 396 397 398
  // Find Super version and reference it. Based on options, it might return
  // the thread local cached one.
  // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
  SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);

  // Similar to the previous function but looks up based on a column family id.
  // nullptr will be returned if this column family no longer exists.
  // REQUIRED: this function should only be called on the write thread or if the
  // mutex is held.
  SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);

A
agiardullo 已提交
399 400 401
  // Same as above, but should be called without the mutex held and not on the
  // write thread.
  SuperVersion* GetAndRefSuperVersionUnlocked(uint32_t column_family_id);

A
agiardullo 已提交
402 403 404 405 406 407 408 409 410 411
  // Un-reference the super version and return it to thread local cache if
  // needed. If it is the last reference of the super version. Clean it up
  // after un-referencing it.
  void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);

  // Similar to the previous function but looks up based on a column family id.
  // nullptr will be returned if this column family no longer exists.
  // REQUIRED: this function should only be called on the write thread.
  void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);

A
agiardullo 已提交
412 413 414 415
  // Same as above, but should be called without the mutex held and not on the
  // write thread.
  void ReturnAndCleanupSuperVersionUnlocked(uint32_t colun_family_id,
                                            SuperVersion* sv);

A
agiardullo 已提交
416 417 418 419 420
  // REQUIRED: this function should only be called on the write thread or if the
  // mutex is held.  Return value only valid until next call to this function or
  // mutex is released.
  ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);

A
agiardullo 已提交
421 422 423
  // Same as above, but should be called without the mutex held and not on the
  // write thread.
  ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id);

424 425 426 427 428 429 430 431 432 433 434 435 436 437
  // Reports how many flushes are running at this moment.
  // REQUIREMENT: the caller must already hold mutex_; this is asserted
  // before the counter is read.
  int num_running_flushes() {
    mutex_.AssertHeld();
    const int running_flushes = num_running_flushes_;
    return running_flushes;
  }

  // Reports how many compactions are running at this moment.
  // REQUIREMENT: the caller must already hold mutex_; this is asserted
  // before the counter is read.
  int num_running_compactions() {
    mutex_.AssertHeld();
    const int running_compactions = num_running_compactions_;
    return running_compactions;
  }

438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
  // Hollow transaction shell used for recovery.
  // These will then be passed to TransactionDB so that
  // locks can be reacquired before writing can resume.
  //
  // Ownership: the object takes ownership of the WriteBatch passed to its
  // constructor and deletes it in the destructor. Because of that, copying is
  // disabled -- two copies would both delete the same batch.
  struct RecoveredTransaction {
    uint64_t log_number_;  // log number supplied at construction
    std::string name_;     // transaction name supplied at construction
    WriteBatch* batch_;    // owned; deleted in the destructor

    // Stores `log` and `name` and takes ownership of `batch`.
    explicit RecoveredTransaction(const uint64_t log, const std::string& name,
                                  WriteBatch* batch)
        : log_number_(log), name_(name), batch_(batch) {}

    // Non-copyable: batch_ is an owning raw pointer, so an implicit copy
    // would lead to a double delete.
    RecoveredTransaction(const RecoveredTransaction&) = delete;
    RecoveredTransaction& operator=(const RecoveredTransaction&) = delete;

    ~RecoveredTransaction() { delete batch_; }
  };

  // Returns the allow_2pc setting stored in db_options_.
  bool allow_2pc() const {
    return db_options_.allow_2pc;
  }

R
Reid Horuff 已提交
454 455 456 457 458
  // Returns a copy of the name -> RecoveredTransaction* map.
  // Only the map itself is copied; the pointers in the returned map refer to
  // the same RecoveredTransaction objects as recovered_transactions_.
  std::unordered_map<std::string, RecoveredTransaction*>
  recovered_transactions() {
    auto map_copy = recovered_transactions_;
    return map_copy;
  }

459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
  // Looks up a recovered transaction by name.
  // Returns the stored pointer, or nullptr when nothing is registered under
  // `name`. The map is not modified.
  RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
    const auto pos = recovered_transactions_.find(name);
    return pos != recovered_transactions_.end() ? pos->second : nullptr;
  }

  // Creates a RecoveredTransaction for (`log`, `name`, `batch`), stores it in
  // recovered_transactions_ under `name`, and then calls
  // MarkLogAsContainingPrepSection(log).
  // The new RecoveredTransaction takes ownership of `batch`.
  // NOTE(review): if `name` is already present, the previous pointer is
  // overwritten without being deleted -- confirm callers never insert the
  // same name twice.
  void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
                                  WriteBatch* batch) {
    auto* recovered = new RecoveredTransaction(log, name, batch);
    recovered_transactions_[name] = recovered;
    MarkLogAsContainingPrepSection(log);
  }

  void DeleteRecoveredTransaction(const std::string& name) {
    auto it = recovered_transactions_.find(name);
    assert(it != recovered_transactions_.end());
    auto* trx = it->second;
    recovered_transactions_.erase(it);
    MarkLogAsHavingPrepSectionFlushed(trx->log_number_);
    delete trx;
  }

R
Reid Horuff 已提交
483 484 485 486 487 488 489 490
  // Destroys every RecoveredTransaction held in recovered_transactions_ and
  // then empties the map.
  // Unlike DeleteRecoveredTransaction(), this does not call
  // MarkLogAsHavingPrepSectionFlushed() for the removed entries.
  void DeleteAllRecoveredTransactions() {
    for (auto& entry : recovered_transactions_) {
      delete entry.second;
    }
    recovered_transactions_.clear();
  }

491 492 493
  void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
  void MarkLogAsContainingPrepSection(uint64_t log);

494 495
  Status NewDB();

496
 protected:
H
heyongqiang 已提交
497 498
  Env* const env_;
  const std::string dbname_;
499
  unique_ptr<VersionSet> versions_;
500
  const DBOptions db_options_;
L
Lei Jin 已提交
501
  Statistics* stats_;
502 503
  std::unordered_map<std::string, RecoveredTransaction*>
      recovered_transactions_;
H
heyongqiang 已提交
504

S
sdong 已提交
505 506 507 508
  InternalIterator* NewInternalIterator(const ReadOptions&,
                                        ColumnFamilyData* cfd,
                                        SuperVersion* super_version,
                                        Arena* arena);
509

510 511 512
  // Except in DB::Open(), WriteOptionsFile can only be called when:
  // 1. WriteThread::Writer::EnterUnbatched() is used.
  // 2. db_mutex is held
513
  Status WriteOptionsFile();
514 515 516 517

  // The following two functions can only be called when:
  // 1. WriteThread::Writer::EnterUnbatched() is used.
  // 2. db_mutex is NOT held
518 519 520
  Status RenameTempFileToOptionsFile(const std::string& file_name);
  Status DeleteObsoleteOptionsFiles();

521
  void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta,
522
                              const MutableCFOptions& mutable_cf_options,
523
                              int job_id, TableProperties prop);
524

O
Ori Bernstein 已提交
525
  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
526 527
                                   Compaction *c, const Status &st,
                                   const CompactionJobStats& job_stats,
528
                                   int job_id);
W
Wanning Jiang 已提交
529
  void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
530
                              const MemTableInfo& mem_table_info);
O
Ori Bernstein 已提交
531

Y
Yueh-Hsuan Chiang 已提交
532 533 534 535 536 537
  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;

  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;

  void EraseThreadStatusDbInfo() const;

A
agiardullo 已提交
538
  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
539 540 541 542 543 544
                   WriteCallback* callback = nullptr,
                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
                   bool disable_memtable = false);

  uint64_t FindMinLogContainingOutstandingPrep();
  uint64_t FindMinPrepLogReferencedByMemTable();
A
agiardullo 已提交
545

J
jorlow@chromium.org 已提交
546 547
 private:
  friend class DB;
548
  friend class InternalStats;
R
Reid Horuff 已提交
549
  friend class TransactionImpl;
I
Igor Canadi 已提交
550
#ifndef ROCKSDB_LITE
L
Lei Jin 已提交
551
  friend class ForwardIterator;
I
Igor Canadi 已提交
552
#endif
553
  friend struct SuperVersion;
L
Lei Jin 已提交
554
  friend class CompactedDBImpl;
A
agiardullo 已提交
555 556 557
#ifndef NDEBUG
  friend class XFTransactionWriteHandler;
#endif
558
  struct CompactionState;
559

S
Stanislau Hlebik 已提交
560
  struct WriteContext;
J
jorlow@chromium.org 已提交
561

562 563
  struct PurgeFileInfo;

J
jorlow@chromium.org 已提交
564 565 566
  // Recover the descriptor from persistent storage.  May do a significant
  // amount of work to recover recently logged updates.  Any changes to
  // be made to the descriptor are added to *edit.
567
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
568 569
                 bool read_only = false, bool error_if_log_file_exist = false,
                 bool error_if_data_exists_in_logs = false);
J
jorlow@chromium.org 已提交
570 571 572

  void MaybeIgnoreError(Status* s) const;

573 574
  const Status CreateArchivalDirectory();

J
jorlow@chromium.org 已提交
575 576
  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();
577 578 579 580
  // Delete obsolete files and log status and information of file deletion
  void DeleteObsoleteFileImpl(Status file_deletion_status, int job_id,
                              const std::string& fname, FileType type,
                              uint64_t number, uint32_t path_id);
J
jorlow@chromium.org 已提交
581

I
Igor Canadi 已提交
582 583
  // Background process needs to call
  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
584
  //     auto file_num = versions_->NewFileNumber();
I
Igor Canadi 已提交
585 586
  //     <do something>
  //     ReleaseFileNumberFromPendingOutputs(x)
587 588
  // This will protect any file with number `file_num` or greater from being
  // deleted while <do something> is running.
I
Igor Canadi 已提交
589 590 591 592 593 594 595 596 597 598 599 600
  // -----------
  // This function will capture current file number and append it to
  // pending_outputs_. This will prevent any background process from deleting any
  // file created after this point.
  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
  // This function should be called with the result of
  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
  // and blocked by any other pending_outputs_ calls)
  void ReleaseFileNumberFromPendingOutputs(std::list<uint64_t>::iterator v);

601
  // Flush the in-memory write buffer to storage.  Switches to a new
J
jorlow@chromium.org 已提交
602
  // log-file/memtable and writes a new descriptor iff successful.
I
Igor Canadi 已提交
603 604 605 606
  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
                                   const MutableCFOptions& mutable_cf_options,
                                   bool* madeProgress, JobContext* job_context,
                                   LogBuffer* log_buffer);
J
jorlow@chromium.org 已提交
607

S
Stanislau Hlebik 已提交
608 609 610
  // REQUIRES: log_numbers are sorted in ascending order
  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
                         SequenceNumber* max_sequence, bool read_only);
J
jorlow@chromium.org 已提交
611

612
  // The following two methods are used to flush a memtable to
Y
Yueh-Hsuan Chiang 已提交
613
  // storage. The first one is used at database RecoveryTime (when the
614 615 616
  // database is opened) and is heavyweight because it holds the mutex
  // for the entire period. The second method WriteLevel0Table supports
  // concurrent flush memtables to storage.
617 618
  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                     MemTable* mem, VersionEdit* edit);
S
sdong 已提交
619 620 621

  // num_bytes: for slowdown case, delay time is calculated based on
  //            `num_bytes` going through.
622
  Status DelayWrite(uint64_t num_bytes);
623

I
Igor Canadi 已提交
624
  Status ScheduleFlushes(WriteContext* context);
625

I
Igor Canadi 已提交
626
  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
S
Stanislau Hlebik 已提交
627

H
heyongqiang 已提交
628
  // Force current memtable contents to be flushed.
629
  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
H
heyongqiang 已提交
630

631
  // Wait for the memtable to be flushed
632
  Status WaitForFlushMemTable(ColumnFamilyData* cfd);
H
heyongqiang 已提交
633

I
Igor Canadi 已提交
634
#ifndef ROCKSDB_LITE
635 636 637 638 639
  // Finds the lowest level in the DB that the ingested file can be added to
  // REQUIRES: mutex_ held
  int PickLevelForIngestedFile(ColumnFamilyData* cfd,
                               const ExternalSstFileInfo* file_info);

640 641 642
  Status CompactFilesImpl(
      const CompactionOptions& compact_options, ColumnFamilyData* cfd,
      Version* version, const std::vector<std::string>& input_file_names,
643 644
      const int output_level, int output_path_id, JobContext* job_context,
      LogBuffer* log_buffer);
I
Igor Canadi 已提交
645
#endif  // ROCKSDB_LITE
646 647 648

  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);

649
  void MaybeScheduleFlushOrCompaction();
650 651
  void SchedulePendingFlush(ColumnFamilyData* cfd);
  void SchedulePendingCompaction(ColumnFamilyData* cfd);
652 653
  void SchedulePendingPurge(std::string fname, FileType type, uint64_t number,
                            uint32_t path_id, int job_id);
654
  static void BGWorkCompaction(void* arg);
655
  static void BGWorkFlush(void* db);
656
  static void BGWorkPurge(void* arg);
657 658
  static void UnscheduleCallback(void* arg);
  void BackgroundCallCompaction(void* arg);
659
  void BackgroundCallFlush();
660
  void BackgroundCallPurge();
I
Igor Canadi 已提交
661
  Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
662
                              LogBuffer* log_buffer, void* m = 0);
I
Igor Canadi 已提交
663
  Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
H
Haobo Xu 已提交
664
                         LogBuffer* log_buffer);
J
jorlow@chromium.org 已提交
665

666 667
  void PrintStatistics();

668
  // dump rocksdb.stats to LOG
669 670
  void MaybeDumpStats();

671 672
  // Return the minimum empty level that could hold the total data in the
  // input level. Return the input level, if such level could not be found.
673 674
  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
      const MutableCFOptions& mutable_cf_options, int level);
675

676 677 678
  // Move the files in the input level to the target level.
  // If target_level < 0, automatically calculate the minimum level that could
  // hold the data set.
I
Igor Canadi 已提交
679
  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
680

681 682 683 684 685 686
  // helper functions for adding and removing from flush & compaction queues
  void AddToCompactionQueue(ColumnFamilyData* cfd);
  ColumnFamilyData* PopFirstFromCompactionQueue();
  void AddToFlushQueue(ColumnFamilyData* cfd);
  ColumnFamilyData* PopFirstFromFlushQueue();

687 688 689
  // helper function to call after some of the logs_ were synced
  void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);

690 691
  const Snapshot* GetSnapshotImpl(bool is_write_conflict_boundary);

J
jorlow@chromium.org 已提交
692
  // table_cache_ provides its own synchronization
I
Igor Canadi 已提交
693
  std::shared_ptr<Cache> table_cache_;
J
jorlow@chromium.org 已提交
694

695
  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
J
jorlow@chromium.org 已提交
696 697
  FileLock* db_lock_;

698 699 700 701
  // The mutex for options file related operations.
  // NOTE: should never acquire options_files_mutex_ and mutex_ at the
  //       same time.
  InstrumentedMutex options_files_mutex_;
J
jorlow@chromium.org 已提交
702
  // State below is protected by mutex_
703
  InstrumentedMutex mutex_;
704

I
Igor Canadi 已提交
705
  std::atomic<bool> shutting_down_;
706 707
  // This condition variable is signaled on these conditions:
  // * whenever bg_compaction_scheduled_ goes down to 0
708
  // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
709 710
  // made any progress
  // * whenever a compaction made any progress
711 712 713
  // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
  // (i.e. whenever a flush is done, even if it didn't make any progress)
  // * whenever there is an error in background purge, flush or compaction
714
  InstrumentedCondVar bg_cv_;
715
  uint64_t logfile_number_;
S
Sage Weil 已提交
716 717
  std::deque<uint64_t>
      log_recycle_files;  // a list of log files that we can recycle
718
  bool log_dir_synced_;
I
Igor Canadi 已提交
719
  bool log_empty_;
720
  ColumnFamilyHandleImpl* default_cf_handle_;
721
  InternalStats* default_cf_internal_stats_;
722
  unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
I
Igor Canadi 已提交
723 724
  // Per-log-file record: the log file number together with a running byte
  // count and a getting_flushed flag.
  struct LogFileNumberSize {
    // Creates a record for log file `_number`. The size starts at 0 and
    // getting_flushed starts at false (see the member initializers below).
    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}

    // Adds `new_size` to the accumulated size of this log file.
    void AddSize(uint64_t new_size) { size += new_size; }

    uint64_t number;               // log file number given at construction
    uint64_t size = 0;             // grows only through AddSize()
    bool getting_flushed = false;  // initially false
  };
  struct LogWriterNumber {
732 733 734 735 736 737 738 739 740 741 742 743 744 745
    // pass ownership of _writer
    LogWriterNumber(uint64_t _number, log::Writer* _writer)
        : number(_number), writer(_writer) {}

    log::Writer* ReleaseWriter() {
      auto* w = writer;
      writer = nullptr;
      return w;
    }
    void ClearWriter() {
      delete writer;
      writer = nullptr;
    }

746
    uint64_t number;
747 748 749
    // Visual Studio doesn't support deque's member to be noncopyable because
    // of a unique_ptr as a member.
    log::Writer* writer;  // own
750 751
    // true for some prefix of logs_
    bool getting_synced = false;
I
Igor Canadi 已提交
752 753
  };
  std::deque<LogFileNumberSize> alive_log_files_;
754 755 756 757 758 759 760 761 762 763
  // Log files that aren't fully synced, and the current log file.
  // Synchronization:
  //  - push_back() is done from write thread with locked mutex_,
  //  - pop_front() is done from any thread with locked mutex_,
  //  - back() and items with getting_synced=true are not popped,
  //  - it follows that write thread with unlocked mutex_ can safely access
  //    back() and items with getting_synced=true.
  std::deque<LogWriterNumber> logs_;
  // Signaled when getting_synced becomes false for some of the logs_.
  InstrumentedCondVar log_sync_cv_;
I
Igor Canadi 已提交
764 765 766 767
  uint64_t total_log_size_;
  // only used for dynamically adjusting max_total_wal_size. it is a sum of
  // [write_buffer_size * max_write_buffer_number] over all column families
  uint64_t max_total_in_memory_state_;
768 769 770
  // If true, we have only one (default) column family. We use this to optimize
  // some code-paths
  bool single_column_family_mode_;
771 772 773
  // If this is non-empty, we need to delete these log files in background
  // threads. Protected by db mutex.
  autovector<log::Writer*> logs_to_free_;
I
Igor Canadi 已提交
774

S
sdong 已提交
775 776
  bool is_snapshot_supported_;

777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804
  // Keeps the Directory objects used by the database: the DB directory
  // (db_dir_), the data directories (data_dirs_) and the WAL directory
  // (wal_dir_).
  class Directories {
   public:
    // Declared here, defined elsewhere.
    // NOTE(review): presumably opens/creates the directories for `dbname`,
    // `wal_dir` and every entry of `data_paths` -- confirm in the .cc file.
    Status SetDirectories(Env* env, const std::string& dbname,
                          const std::string& wal_dir,
                          const std::vector<DbPath>& data_paths);

    // Declared here, defined elsewhere; looks up a data directory by path id.
    Directory* GetDataDir(size_t path_id);

    // Returns the WAL directory when wal_dir_ is set; otherwise falls back to
    // the DB directory.
    Directory* GetWalDir() {
      return wal_dir_ ? wal_dir_.get() : db_dir_.get();
    }

    // Returns the DB directory (nullptr when db_dir_ is not set).
    Directory* GetDbDir() { return db_dir_.get(); }

   private:
    // Helper declared here, defined elsewhere; reports its result through
    // `directory`.
    Status CreateAndNewDirectory(Env* env, const std::string& dirname,
                                 std::unique_ptr<Directory>* directory) const;

    std::unique_ptr<Directory> db_dir_;
    std::vector<std::unique_ptr<Directory>> data_dirs_;
    std::unique_ptr<Directory> wal_dir_;
  };

  Directories directories_;
805

806 807
  WriteBuffer write_buffer_;

I
Igor Canadi 已提交
808 809
  WriteThread write_thread_;

810
  WriteBatch tmp_batch_;
811

812
  WriteController write_controller_;
S
sdong 已提交
813 814 815 816 817

  // Size of the last batch group. In slowdown mode, next write needs to
  // sleep if it uses up the quota.
  uint64_t last_batch_group_size_;

I
Igor Canadi 已提交
818
  FlushScheduler flush_scheduler_;
819

J
jorlow@chromium.org 已提交
820 821
  SnapshotList snapshots_;

I
Igor Canadi 已提交
822 823 824 825 826 827 828 829 830 831
  // For each background job, pending_outputs_ keeps the current file number at
  // the time that background job started.
  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
  // number bigger than any of the file number in pending_outputs_. Since file
  // numbers grow monotonically, this also means that pending_outputs_ is always
  // sorted. After a background job is done executing, its file number is
  // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
  // it up.
  // State is protected with db mutex.
  std::list<uint64_t> pending_outputs_;
J
jorlow@chromium.org 已提交
832

833 834 835 836 837 838 839 840 841 842 843 844 845
  // PurgeFileInfo is a structure to hold information of files to be deleted in
  // purge_queue_
  struct PurgeFileInfo {
    std::string fname;
    FileType type;
    uint64_t number;
    uint32_t path_id;
    int job_id;

    // `fn` is a sink parameter: it is taken by value and moved into `fname`,
    // so passing an rvalue (or std::move-ing a string the caller no longer
    // needs) avoids copying the file name. All other arguments are stored
    // as given.
    PurgeFileInfo(std::string fn, FileType t, uint64_t num, uint32_t pid,
                  int jid)
        : fname(std::move(fn)),
          type(t),
          number(num),
          path_id(pid),
          job_id(jid) {}
  };

846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869
  // flush_queue_ and compaction_queue_ hold column families that we need to
  // flush and compact, respectively.
  // A column family is inserted into flush_queue_ when it satisfies condition
  // cfd->imm()->IsFlushPending()
  // A column family is inserted into compaction_queue_ when it satisfies
  // condition cfd->NeedsCompaction()
  // Column families in this list are all Ref()-erenced
  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
  // do RAII on ColumnFamilyData
  // Column families are in this queue when they need to be flushed or
  // compacted. Consumers of these queues are flush and compaction threads. When
  // column family is put on this queue, we increase unscheduled_flushes_ and
  // unscheduled_compactions_. When these variables are bigger than zero, that
  // means we need to schedule background threads for flush and compaction.
  // Once the background threads are scheduled, we decrease unscheduled_flushes_
  // and unscheduled_compactions_. That way we keep track of number of
  // compaction and flush threads we need to schedule. This scheduling is done
  // in MaybeScheduleFlushOrCompaction()
  // invariant(column family present in flush_queue_ <==>
  // ColumnFamilyData::pending_flush_ == true)
  std::deque<ColumnFamilyData*> flush_queue_;
  // invariant(column family present in compaction_queue_ <==>
  // ColumnFamilyData::pending_compaction_ == true)
  std::deque<ColumnFamilyData*> compaction_queue_;
870 871 872

  // A queue to store filenames of the files to be purged
  std::deque<PurgeFileInfo> purge_queue_;
873 874
  int unscheduled_flushes_;
  int unscheduled_compactions_;
875

876
  // count how many background compactions are running or have been scheduled
877
  int bg_compaction_scheduled_;
J
jorlow@chromium.org 已提交
878

879 880 881
  // stores the number of compactions that are currently running
  int num_running_compactions_;

882 883 884
  // number of background memtable flush jobs, submitted to the HIGH pool
  int bg_flush_scheduled_;

885 886 887
  // stores the number of flushes that are currently running
  int num_running_flushes_;

888 889 890
  // number of background obsolete file purge jobs, submitted to the HIGH pool
  int bg_purge_scheduled_;

H
hans@chromium.org 已提交
891 892
  // Information for a manual compaction
  struct ManualCompaction {
I
Igor Canadi 已提交
893
    ColumnFamilyData* cfd;
894 895
    int input_level;
    int output_level;
896
    uint32_t output_path_id;
L
Lei Jin 已提交
897
    Status status;
898
    bool done;
899
    bool in_progress;             // compaction request being processed?
900 901 902
    bool incomplete;              // only part of requested range compacted
    bool exclusive;               // current behavior of only one manual
    bool disallow_trivial_move;   // Force actual compaction to run
903 904
    const InternalKey* begin;     // nullptr means beginning of key range
    const InternalKey* end;       // nullptr means end of key range
905
    InternalKey* manual_end;      // how far we are compacting
906
    InternalKey tmp_storage;      // Used to keep track of compaction progress
907 908 909 910 911 912 913 914
    InternalKey tmp_storage1;     // Used to keep track of compaction progress
    Compaction* compaction;
  };
  std::deque<ManualCompaction*> manual_compaction_dequeue_;

  // Bundles a DBImpl pointer with a ManualCompaction pointer.
  // NOTE(review): presumably the payload passed through the `void* arg` of
  // the background compaction callbacks -- confirm in the .cc file.
  struct CompactionArg {
    DBImpl* db;
    ManualCompaction* m;
  };
J
jorlow@chromium.org 已提交
916 917 918 919

  // Have we encountered a background error in paranoid mode?
  Status bg_error_;

920
  // shall we disable deletion of obsolete files
921 922 923 924 925 926
  // if 0 the deletion is enabled.
  // if non-zero, files will not be getting deleted
  // This enables two different threads to call
  // EnableFileDeletions() and DisableFileDeletions()
  // without any synchronization
  int disable_delete_obsolete_files_;
927

I
Igor Canadi 已提交
928 929
  // next time when we should run DeleteObsoleteFiles with full scan
  uint64_t delete_obsolete_files_next_run_;
930

931
  // last time stats were dumped to LOG
H
Haobo Xu 已提交
932
  std::atomic<uint64_t> last_stats_dump_time_microsec_;
933

934 935 936 937
  // Each flush or compaction gets its own job id. this counter makes sure
  // they're unique
  std::atomic<int> next_job_id_;

938 939 940 941
  // A flag indicating whether the current rocksdb database has any
  // data that is not yet persisted into either WAL or SST file.
  // Used when disableWAL is true.
  bool has_unpersisted_data_;
942

H
heyongqiang 已提交
943
  static const int KEEP_LOG_FILE_NUM = 1000;
D
Dmitri Smirnov 已提交
944
  // MSVC version 1800 still does not have constexpr for ::max()
945
  static const uint64_t kNoTimeOut = port::kMaxUint64;
D
Dmitri Smirnov 已提交
946

H
heyongqiang 已提交
947
  std::string db_absolute_path_;
H
heyongqiang 已提交
948

949
  // The options to access storage files
L
Lei Jin 已提交
950
  const EnvOptions env_options_;
951

952 953 954 955
  // A set of compactions that are running right now
  // REQUIRES: mutex held
  std::unordered_set<Compaction*> running_compactions_;

I
Igor Canadi 已提交
956 957 958 959
#ifndef ROCKSDB_LITE
  WalManager wal_manager_;
#endif  // ROCKSDB_LITE

I
Igor Canadi 已提交
960 961 962
  // Unified interface for logging events
  EventLogger event_logger_;

963
  // A value of > 0 temporarily disables scheduling of background work
964
  int bg_work_paused_;
965

966 967 968
  // A value of > 0 temporarily disables scheduling of background compaction
  int bg_compaction_paused_;

969 970 971
  // Guard against multiple concurrent refitting
  bool refitting_level_;

972 973 974
  // Indicate DB was opened successfully
  bool opened_successfully_;

975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996
  // minimum log number still containing prepared data.
  // this is used by FindObsoleteFiles to determine which
  // flushed logs we must keep around because they still
  // contain prepared data which has not been flushed or rolled back
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      min_log_with_prep_;

  // to be used in conjunction with min_log_with_prep_.
  // once a transaction with data in log L is committed or rolled back
  // rather than removing the value from the heap we add that value
  // to prepared_section_completed_ which maps LOG -> instance_count
  // since a log could contain multiple prepared sections
  //
  // when trying to determine the minimum log still active we first
  // consult min_log_with_prep_. while that root value maps to
  // a value > 0 in prepared_section_completed_ we decrement the
  // instance_count for that log and pop the root value in
  // min_log_with_prep_. This will work the same as a min_heap
  // where we are deleting arbitrary elements and then up-heaping.
  std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
  std::mutex prep_heap_mutex_;

J
jorlow@chromium.org 已提交
997 998 999 1000
  // No copying allowed
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

1001 1002 1003 1004 1005 1006
  // Return the earliest snapshot where seqno is visible.
  // Store the snapshot right before that, if any, in prev_snapshot
  inline SequenceNumber findEarliestVisibleSnapshot(
    SequenceNumber in,
    std::vector<SequenceNumber>& snapshots,
    SequenceNumber* prev_snapshot);
1007

I
Igor Canadi 已提交
1008
  // Background threads call this function, which is just a wrapper around
I
Igor Canadi 已提交
1009 1010 1011
  // the InstallSuperVersion() function. Background threads carry
  // job_context which can have new_superversion already
  // allocated.
I
Igor Canadi 已提交
1012
  void InstallSuperVersionAndScheduleWorkWrapper(
I
Igor Canadi 已提交
1013 1014
      ColumnFamilyData* cfd, JobContext* job_context,
      const MutableCFOptions& mutable_cf_options);
L
Lei Jin 已提交
1015

1016 1017 1018
  // All ColumnFamily state changes go through this function. Here we analyze
  // the new state and we schedule background work if we detect that the new
  // state needs flush or compaction.
I
Igor Canadi 已提交
1019 1020 1021
  SuperVersion* InstallSuperVersionAndScheduleWork(
      ColumnFamilyData* cfd, SuperVersion* new_sv,
      const MutableCFOptions& mutable_cf_options);
I
Igor Canadi 已提交
1022

I
Igor Canadi 已提交
1023
#ifndef ROCKSDB_LITE
I
Igor Canadi 已提交
1024 1025 1026
  using DB::GetPropertiesOfAllTables;
  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
                                          TablePropertiesCollection* props)
1027
      override;
1028
  virtual Status GetPropertiesOfTablesInRange(
1029
      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
1030 1031
      TablePropertiesCollection* props) override;

I
Igor Canadi 已提交
1032
#endif  // ROCKSDB_LITE
1033

1034 1035
  // Function that Get and KeyMayExist call with no_io true or false
  // Note: 'value_found' from KeyMayExist propagates here
1036 1037 1038
  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
                 const Slice& key, std::string* value,
                 bool* value_found = nullptr);
1039

1040
  bool GetIntPropertyInternal(ColumnFamilyData* cfd,
1041 1042
                              const DBPropertyInfo& property_info,
                              bool is_locked, uint64_t* value);
1043 1044 1045 1046 1047

  bool HasPendingManualCompaction();
  bool HasExclusiveManualCompaction();
  void AddManualCompaction(ManualCompaction* m);
  void RemoveManualCompaction(ManualCompaction* m);
1048
  bool ShouldntRunManualCompaction(ManualCompaction* m);
1049 1050
  bool HaveManualCompaction(ColumnFamilyData* cfd);
  bool MCOverlap(ManualCompaction* m, ManualCompaction* m1);
J
jorlow@chromium.org 已提交
1051 1052 1053 1054 1055 1056 1057
};

// Sanitize db options.  The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
                               const Options& src);
1058
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
S
Siying Dong 已提交
1059

M
miguelportilla 已提交
1060 1061 1062 1063 1064 1065 1066
// Fix user-supplied options to be reasonable.
//
// Clamps *ptr into [minvalue, maxvalue]. Both comparisons are made on the
// value of *ptr converted to V. The upper bound is applied first and the
// lower bound second, so when minvalue > maxvalue the result is minvalue.
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  T& target = *ptr;
  if (static_cast<V>(target) > maxvalue) {
    target = maxvalue;
  }
  // Re-reads the (possibly just updated) value before the lower-bound check.
  if (static_cast<V>(target) < minvalue) {
    target = minvalue;
  }
}

1067
}  // namespace rocksdb