db_impl.h 17.9 KB
Newer Older
1 2 3 4 5
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#pragma once
H
Haobo Xu 已提交
10
#include <atomic>
11
#include <deque>
J
jorlow@chromium.org 已提交
12
#include <set>
13
#include <vector>
J
jorlow@chromium.org 已提交
14 15 16
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
17
#include "db/version_edit.h"
18 19 20 21
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
J
jorlow@chromium.org 已提交
22
#include "port/port.h"
23
#include "util/stats_logger.h"
24
#include "memtablelist.h"
25

26
namespace rocksdb {
J
jorlow@chromium.org 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39 40

class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;

class DBImpl : public DB {
 public:
  DBImpl(const Options& options, const std::string& dbname);
  virtual ~DBImpl();

  // Implementations of the DB interface
  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
41 42
  virtual Status Merge(const WriteOptions&, const Slice& key,
                       const Slice& value);
J
jorlow@chromium.org 已提交
43 44 45 46 47
  virtual Status Delete(const WriteOptions&, const Slice& key);
  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value);
48 49 50
  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values);
51

52 53 54 55 56 57 58 59
  // Returns false if key doesn't exist in the database and true if it may.
  // If value_found is not passed in as null, then return the value if found in
  // memory. On return, if value was found, then value_found will be set to true
  // , otherwise false.
  virtual bool KeyMayExist(const ReadOptions& options,
                           const Slice& key,
                           std::string* value,
                           bool* value_found = nullptr);
J
jorlow@chromium.org 已提交
60 61 62
  virtual Iterator* NewIterator(const ReadOptions&);
  virtual const Snapshot* GetSnapshot();
  virtual void ReleaseSnapshot(const Snapshot* snapshot);
63
  virtual bool GetProperty(const Slice& property, std::string* value);
J
jorlow@chromium.org 已提交
64
  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
65
  virtual void CompactRange(const Slice* begin, const Slice* end,
66
                            bool reduce_level = false, int target_level = -1);
67 68 69
  virtual int NumberLevels();
  virtual int MaxMemCompactionLevel();
  virtual int Level0StopWriteTrigger();
70
  virtual Env* GetEnv() const;
I
Igor Canadi 已提交
71
  virtual const Options& GetOptions() const;
H
heyongqiang 已提交
72
  virtual Status Flush(const FlushOptions& options);
73 74
  virtual Status DisableFileDeletions();
  virtual Status EnableFileDeletions();
I
Igor Canadi 已提交
75
  // All the returned filenames start with "/"
76
  virtual Status GetLiveFiles(std::vector<std::string>&,
77 78
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true);
79
  virtual Status GetSortedWalFiles(VectorLogPtr& files);
80
  virtual SequenceNumber GetLatestSequenceNumber() const;
81
  virtual Status GetUpdatesSince(SequenceNumber seq_number,
82
                                 unique_ptr<TransactionLogIterator>* iter);
83 84 85 86
  virtual Status DeleteFile(std::string name);

  virtual void GetLiveFilesMetaData(
    std::vector<LiveFileMetaData> *metadata);
87

J
jorlow@chromium.org 已提交
88 89
  // Extra methods (for testing) that are not in the public DB interface

90
  // Compact any files in the named level that overlap [*begin, *end]
G
Gabor Cselle 已提交
91
  void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
J
jorlow@chromium.org 已提交
92

93 94
  // Force current memtable contents to be flushed.
  Status TEST_FlushMemTable();
J
jorlow@chromium.org 已提交
95

96
  // Wait for memtable compaction
97
  Status TEST_WaitForFlushMemTable();
98 99 100 101

  // Wait for any compaction
  Status TEST_WaitForCompact();

J
jorlow@chromium.org 已提交
102 103 104 105 106
  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
  Iterator* TEST_NewInternalIterator();

107 108
  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
J
jorlow@chromium.org 已提交
109
  int64_t TEST_MaxNextLevelOverlappingBytes();
110

111 112 113
  // Simulate a db crash, no elegant closing of database.
  void TEST_Destroy_DBImpl();

A
Abhishek Kona 已提交
114 115
  // Return the current manifest file no.
  uint64_t TEST_Current_Manifest_FileNo();
116 117 118 119

  // Trigger's a background call for testing.
  void TEST_PurgeObsoleteteWAL();

120 121 122
  // get total level0 file size. Only for testing.
  uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);}

123 124 125 126 127
  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
  {
    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
  }

I
Igor Canadi 已提交
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
  // needed for CleanupIteratorState

  struct DeletionState {
    inline bool HaveSomethingToDelete() const {
      return all_files.size() ||
        sst_delete_files.size() ||
        log_delete_files.size();
    }
    // a list of all files that we'll consider deleting
    // (every once in a while this is filled up with all files
    // in the DB directory)
    std::vector<std::string> all_files;

    // the list of all live sst files that cannot be deleted
    std::vector<uint64_t> sst_live;

    // a list of sst files that we need to delete
    std::vector<FileMetaData*> sst_delete_files;

    // a list of log files that we need to delete
    std::vector<uint64_t> log_delete_files;

    // the current manifest_file_number, log_number and prev_log_number
    // that corresponds to the set of files in 'live'.
    uint64_t manifest_file_number, log_number, prev_log_number;

    DeletionState() {
      manifest_file_number = 0;
      log_number = 0;
      prev_log_number = 0;
    }
  };

  // Returns the list of live files in 'live' and the list
  // of all files in the filesystem in 'all_files'.
  // If force == false and the last call was less than
  // options_.delete_obsolete_files_period_micros microseconds ago,
  // it will not fill up the deletion_state
  void FindObsoleteFiles(DeletionState& deletion_state,
                         bool force,
                         bool no_full_scan = false);

  // Diffs the files listed in filenames and those that do not
  // belong to live files are posibly removed. Also, removes all the
  // files in sst_delete_files and log_delete_files.
  // It is not necessary to hold the mutex when invoking this method.
  void PurgeObsoleteFiles(DeletionState& deletion_state);

176
 protected:
H
heyongqiang 已提交
177 178
  Env* const env_;
  const std::string dbname_;
179
  unique_ptr<VersionSet> versions_;
H
heyongqiang 已提交
180 181 182 183 184 185
  const InternalKeyComparator internal_comparator_;
  const Options options_;  // options_.comparator == &internal_comparator_

  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
186

187 188
  MemTable* GetMemTable() {
    return mem_;
A
Abhishek Kona 已提交
189
  }
H
heyongqiang 已提交
190

191 192 193
  Iterator* NewInternalIterator(const ReadOptions&,
                                SequenceNumber* latest_snapshot);

J
jorlow@chromium.org 已提交
194 195
 private:
  friend class DB;
196 197
  struct CompactionState;
  struct Writer;
J
jorlow@chromium.org 已提交
198 199 200 201 202 203

  Status NewDB();

  // Recover the descriptor from persistent storage.  May do a significant
  // amount of work to recover recently logged updates.  Any changes to
  // be made to the descriptor are added to *edit.
204
  Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
H
heyongqiang 已提交
205
      bool error_if_log_file_exist = false);
J
jorlow@chromium.org 已提交
206 207 208

  void MaybeIgnoreError(Status* s) const;

209 210
  const Status CreateArchivalDirectory();

J
jorlow@chromium.org 已提交
211 212 213
  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();

214
  // Flush the in-memory write buffer to storage.  Switches to a new
J
jorlow@chromium.org 已提交
215
  // log-file/memtable and writes a new descriptor iff successful.
I
Igor Canadi 已提交
216 217
  Status FlushMemTableToOutputFile(bool* madeProgress,
                                   DeletionState& deletion_state);
J
jorlow@chromium.org 已提交
218 219 220

  Status RecoverLogFile(uint64_t log_number,
                        VersionEdit* edit,
221 222
                        SequenceNumber* max_sequence,
                        MemTable* external_table);
J
jorlow@chromium.org 已提交
223

224 225 226 227 228 229
  // The following two methods are used to flush a memtable to
  // storage. The first one is used atdatabase RecoveryTime (when the
  // database is opened) and is heavyweight because it holds the mutex
  // for the entire period. The second method WriteLevel0Table supports
  // concurrent flush memtables to storage.
  Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
230
  Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
231
                                uint64_t* filenumber);
J
jorlow@chromium.org 已提交
232

J
Jim Paton 已提交
233
  uint64_t SlowdownAmount(int n, int top, int bottom);
234
  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
235
  WriteBatch* BuildBatchGroup(Writer** last_writer);
J
jorlow@chromium.org 已提交
236

H
heyongqiang 已提交
237 238 239
  // Force current memtable contents to be flushed.
  Status FlushMemTable(const FlushOptions& options);

240 241
  // Wait for memtable flushed
  Status WaitForFlushMemTable();
H
heyongqiang 已提交
242

243
  void MaybeScheduleLogDBDeployStats();
244 245
  static void BGLogDBDeployStats(void* db);
  void LogDBDeployStats();
246

247
  void MaybeScheduleFlushOrCompaction();
248 249 250 251
  static void BGWorkCompaction(void* db);
  static void BGWorkFlush(void* db);
  void BackgroundCallCompaction();
  void BackgroundCallFlush();
252
  Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
I
Igor Canadi 已提交
253
  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
254
  void CleanupCompaction(CompactionState* compact, Status status);
I
Igor Canadi 已提交
255 256
  Status DoCompactionWork(CompactionState* compact,
                          DeletionState& deletion_state);
J
jorlow@chromium.org 已提交
257 258 259 260

  Status OpenCompactionOutputFile(CompactionState* compact);
  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
  Status InstallCompactionResults(CompactionState* compact);
261 262
  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
A
Abhishek Kona 已提交
263

264
  void PurgeObsoleteWALFiles();
265

266 267 268
  Status AppendSortedWalsOfType(const std::string& path,
                                VectorLogPtr& log_files,
                                WalFileType type);
269

270 271 272 273 274
  // Requires: all_logs should be sorted with earliest log file first
  // Retains all log files in all_logs which contain updates with seq no.
  // Greater Than or Equal to the requested SequenceNumber.
  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
                                const SequenceNumber target);
275
  //  return true if
276 277
  bool CheckWalFileExistsAndEmpty(const WalFileType type,
                                  const uint64_t number);
278

279 280
  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
                         WriteBatch* const result);
281 282

  Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
283

284 285
  void PrintStatistics();

286
  // dump rocksdb.stats to LOG
287 288
  void MaybeDumpStats();

289 290 291 292
  // Return the minimum empty level that could hold the total data in the
  // input level. Return the input level, if such level could not be found.
  int FindMinimumEmptyLevelFitting(int level);

293 294 295 296
  // Move the files in the input level to the target level.
  // If target_level < 0, automatically calculate the minimum level that could
  // hold the data set.
  void ReFitLevel(int level, int target_level = -1);
297

J
jorlow@chromium.org 已提交
298
  // Constant after construction
S
Sanjay Ghemawat 已提交
299
  const InternalFilterPolicy internal_filter_policy_;
J
jorlow@chromium.org 已提交
300 301 302
  bool owns_info_log_;

  // table_cache_ provides its own synchronization
303
  unique_ptr<TableCache> table_cache_;
J
jorlow@chromium.org 已提交
304

305
  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
J
jorlow@chromium.org 已提交
306 307 308 309 310
  FileLock* db_lock_;

  // State below is protected by mutex_
  port::Mutex mutex_;
  port::AtomicPointer shutting_down_;
H
hans@chromium.org 已提交
311
  port::CondVar bg_cv_;          // Signalled when background work finishes
J
Jim Paton 已提交
312
  std::shared_ptr<MemTableRepFactory> mem_rep_factory_;
J
jorlow@chromium.org 已提交
313
  MemTable* mem_;
314
  MemTableList imm_;             // Memtable that are not changing
315
  uint64_t logfile_number_;
316
  unique_ptr<log::Writer> log_;
317

318 319
  std::string host_name_;

320 321
  // Queue of writers.
  std::deque<Writer*> writers_;
322
  WriteBatch tmp_batch_;
323

J
jorlow@chromium.org 已提交
324 325 326 327 328 329
  SnapshotList snapshots_;

  // Set of table files to protect from deletion because they are
  // part of ongoing compactions.
  std::set<uint64_t> pending_outputs_;

330 331
  // count how many background compaction been scheduled or is running?
  int bg_compaction_scheduled_;
J
jorlow@chromium.org 已提交
332

333 334 335
  // number of background memtable flush jobs, submitted to the HIGH pool
  int bg_flush_scheduled_;

336 337 338
  // Has a background stats log thread scheduled?
  bool bg_logstats_scheduled_;

H
hans@chromium.org 已提交
339 340 341
  // Information for a manual compaction
  struct ManualCompaction {
    int level;
G
Gabor Cselle 已提交
342
    bool done;
343
    bool in_progress;           // compaction request being processed?
344 345
    const InternalKey* begin;   // nullptr means beginning of key range
    const InternalKey* end;     // nullptr means end of key range
G
Gabor Cselle 已提交
346
    InternalKey tmp_storage;    // Used to keep track of compaction progress
H
hans@chromium.org 已提交
347 348
  };
  ManualCompaction* manual_compaction_;
J
jorlow@chromium.org 已提交
349 350 351 352

  // Have we encountered a background error in paranoid mode?
  Status bg_error_;

353
  std::unique_ptr<StatsLogger> logger_;
354

355
  int64_t volatile last_log_ts;
356

357 358 359
  // shall we disable deletion of obsolete files
  bool disable_delete_obsolete_files_;

360 361 362
  // last time when DeleteObsoleteFiles was invoked
  uint64_t delete_obsolete_files_last_run_;

363 364 365
  // last time when PurgeObsoleteWALFiles ran.
  uint64_t purge_wal_files_last_run_;

366
  // last time stats were dumped to LOG
H
Haobo Xu 已提交
367
  std::atomic<uint64_t> last_stats_dump_time_microsec_;
368

369 370 371 372
  // obsolete files will be deleted every this seconds if ttl deletion is
  // enabled and archive size_limit is disabled.
  uint64_t default_interval_to_delete_obsolete_WAL_;

M
Mark Callaghan 已提交
373 374 375 376
  // These count the number of microseconds for which MakeRoomForWrite stalls.
  uint64_t stall_level0_slowdown_;
  uint64_t stall_memtable_compaction_;
  uint64_t stall_level0_num_files_;
377
  std::vector<uint64_t> stall_leveln_slowdown_;
J
Jim Paton 已提交
378 379 380 381
  uint64_t stall_level0_slowdown_count_;
  uint64_t stall_memtable_compaction_count_;
  uint64_t stall_level0_num_files_count_;
  std::vector<uint64_t> stall_leveln_slowdown_count_;
M
Mark Callaghan 已提交
382 383 384 385

  // Time at which this instance was started.
  const uint64_t started_at_;

386 387
  bool flush_on_destroy_; // Used when disableWAL is true.

388 389 390
  // Per level compaction stats.  stats_[level] stores the stats for
  // compactions that produced data for the specified "level".
  struct CompactionStats {
A
Abhishek Kona 已提交
391
    uint64_t micros;
M
Mark Callaghan 已提交
392 393 394 395 396 397 398 399

    // Bytes read from level N during compaction between levels N and N+1
    int64_t bytes_readn;

    // Bytes read from level N+1 during compaction between levels N and N+1
    int64_t bytes_readnp1;

    // Total bytes written during compaction between levels N and N+1
400 401
    int64_t bytes_written;

M
Mark Callaghan 已提交
402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417
    // Files read from level N during compaction between levels N and N+1
    int     files_in_leveln;

    // Files read from level N+1 during compaction between levels N and N+1
    int     files_in_levelnp1;

    // Files written during compaction between levels N and N+1
    int     files_out_levelnp1;

    // Number of compactions done
    int     count;

    CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
                        bytes_written(0), files_in_leveln(0),
                        files_in_levelnp1(0), files_out_levelnp1(0),
                        count(0) { }
418 419 420

    void Add(const CompactionStats& c) {
      this->micros += c.micros;
M
Mark Callaghan 已提交
421 422
      this->bytes_readn += c.bytes_readn;
      this->bytes_readnp1 += c.bytes_readnp1;
423
      this->bytes_written += c.bytes_written;
M
Mark Callaghan 已提交
424 425 426 427
      this->files_in_leveln += c.files_in_leveln;
      this->files_in_levelnp1 += c.files_in_levelnp1;
      this->files_out_levelnp1 += c.files_out_levelnp1;
      this->count += 1;
428 429
    }
  };
M
Mark Callaghan 已提交
430

431
  std::vector<CompactionStats> stats_;
432

433 434 435 436 437 438 439 440 441 442 443 444 445
  // Used to compute per-interval statistics
  struct StatsSnapshot {
    uint64_t bytes_read_;
    uint64_t bytes_written_;
    uint64_t bytes_new_;
    double   seconds_up_;

    StatsSnapshot() : bytes_read_(0), bytes_written_(0),
                      bytes_new_(0), seconds_up_(0) {}
  };

  StatsSnapshot last_stats_;

H
heyongqiang 已提交
446
  static const int KEEP_LOG_FILE_NUM = 1000;
H
heyongqiang 已提交
447
  std::string db_absolute_path_;
H
heyongqiang 已提交
448

449 450 451
  // count of the number of contiguous delaying writes
  int delayed_writes_;

452
  // The options to access storage files
H
Haobo Xu 已提交
453
  const EnvOptions storage_options_;
454

455 456 457 458 459 460
  // A value of true temporarily disables scheduling of background work
  bool bg_work_gate_closed_;

  // Guard against multiple concurrent refitting
  bool refitting_level_;

J
jorlow@chromium.org 已提交
461 462 463 464
  // No copying allowed
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

465 466
  // dump the delayed_writes_ to the log file and reset counter.
  void DelayLoggingAndReset();
467

468 469 470 471 472 473
  // Return the earliest snapshot where seqno is visible.
  // Store the snapshot right before that, if any, in prev_snapshot
  inline SequenceNumber findEarliestVisibleSnapshot(
    SequenceNumber in,
    std::vector<SequenceNumber>& snapshots,
    SequenceNumber* prev_snapshot);
474

475 476
  // Function that Get and KeyMayExist call with no_io true or false
  // Note: 'value_found' from KeyMayExist propagates here
477 478 479
  Status GetImpl(const ReadOptions& options,
                 const Slice& key,
                 std::string* value,
480
                 bool* value_found = nullptr);
J
jorlow@chromium.org 已提交
481 482 483 484 485 486
};

// Sanitize db options.  The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
S
Sanjay Ghemawat 已提交
487
                               const InternalFilterPolicy* ipolicy,
J
jorlow@chromium.org 已提交
488 489
                               const Options& src);

S
Siying Dong 已提交
490 491 492 493 494 495 496 497 498

// Determine compression type, based on user options, level of the output
// file and whether compression is disabled.
// If enable_compression is false, then compression is always disabled no
// matter what the values of the other two parameters are.
// Otherwise, the compression type is determined based on options and level.
CompressionType GetCompressionType(const Options& options, int level,
                                   const bool enable_compression);

499
}  // namespace rocksdb