db_impl.h 17.9 KB
Newer Older
1 2 3 4 5
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#pragma once
H
Haobo Xu 已提交
10
#include <atomic>
11
#include <deque>
J
jorlow@chromium.org 已提交
12
#include <set>
13
#include <vector>
J
jorlow@chromium.org 已提交
14 15 16
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
17
#include "db/version_edit.h"
18 19 20 21
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
J
jorlow@chromium.org 已提交
22
#include "port/port.h"
23
#include "util/stats_logger.h"
24
#include "memtablelist.h"
25

26
namespace rocksdb {
J
jorlow@chromium.org 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39 40

class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;

class DBImpl : public DB {
 public:
  DBImpl(const Options& options, const std::string& dbname);
  virtual ~DBImpl();

  // Implementations of the DB interface
  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
41 42
  virtual Status Merge(const WriteOptions&, const Slice& key,
                       const Slice& value);
J
jorlow@chromium.org 已提交
43 44 45 46 47
  virtual Status Delete(const WriteOptions&, const Slice& key);
  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value);
48 49 50
  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values);
51

52 53 54 55 56 57 58 59
  // Returns false if key doesn't exist in the database and true if it may.
  // If value_found is not passed in as null, then return the value if found in
  // memory. On return, if value was found, then value_found will be set to true
  // , otherwise false.
  virtual bool KeyMayExist(const ReadOptions& options,
                           const Slice& key,
                           std::string* value,
                           bool* value_found = nullptr);
J
jorlow@chromium.org 已提交
60 61 62
  virtual Iterator* NewIterator(const ReadOptions&);
  virtual const Snapshot* GetSnapshot();
  virtual void ReleaseSnapshot(const Snapshot* snapshot);
63
  virtual bool GetProperty(const Slice& property, std::string* value);
J
jorlow@chromium.org 已提交
64
  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
65
  virtual void CompactRange(const Slice* begin, const Slice* end,
66
                            bool reduce_level = false, int target_level = -1);
67 68 69
  virtual int NumberLevels();
  virtual int MaxMemCompactionLevel();
  virtual int Level0StopWriteTrigger();
H
heyongqiang 已提交
70
  virtual Status Flush(const FlushOptions& options);
71 72
  virtual Status DisableFileDeletions();
  virtual Status EnableFileDeletions();
I
Igor Canadi 已提交
73
  // All the returned filenames start with "/"
74
  virtual Status GetLiveFiles(std::vector<std::string>&,
75 76
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true);
77
  virtual Status GetSortedWalFiles(VectorLogPtr& files);
78
  virtual SequenceNumber GetLatestSequenceNumber() const;
79
  virtual Status GetUpdatesSince(SequenceNumber seq_number,
80
                                 unique_ptr<TransactionLogIterator>* iter);
81 82 83 84
  virtual Status DeleteFile(std::string name);

  virtual void GetLiveFilesMetaData(
    std::vector<LiveFileMetaData> *metadata);
85

J
jorlow@chromium.org 已提交
86 87
  // Extra methods (for testing) that are not in the public DB interface

88
  // Compact any files in the named level that overlap [*begin, *end]
G
Gabor Cselle 已提交
89
  void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
J
jorlow@chromium.org 已提交
90

91 92
  // Force current memtable contents to be flushed.
  Status TEST_FlushMemTable();
J
jorlow@chromium.org 已提交
93

94
  // Wait for memtable compaction
95
  Status TEST_WaitForFlushMemTable();
96 97 98 99

  // Wait for any compaction
  Status TEST_WaitForCompact();

J
jorlow@chromium.org 已提交
100 101 102 103 104
  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
  Iterator* TEST_NewInternalIterator();

105 106
  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
J
jorlow@chromium.org 已提交
107
  int64_t TEST_MaxNextLevelOverlappingBytes();
108

109 110 111
  // Simulate a db crash, no elegant closing of database.
  void TEST_Destroy_DBImpl();

A
Abhishek Kona 已提交
112 113
  // Return the current manifest file no.
  uint64_t TEST_Current_Manifest_FileNo();
114 115 116 117

  // Trigger's a background call for testing.
  void TEST_PurgeObsoleteteWAL();

118 119 120
  // get total level0 file size. Only for testing.
  uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);}

121 122 123 124 125
  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
  {
    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
  }

I
Igor Canadi 已提交
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
  // needed for CleanupIteratorState

  struct DeletionState {
    inline bool HaveSomethingToDelete() const {
      return all_files.size() ||
        sst_delete_files.size() ||
        log_delete_files.size();
    }
    // a list of all files that we'll consider deleting
    // (every once in a while this is filled up with all files
    // in the DB directory)
    std::vector<std::string> all_files;

    // the list of all live sst files that cannot be deleted
    std::vector<uint64_t> sst_live;

    // a list of sst files that we need to delete
    std::vector<FileMetaData*> sst_delete_files;

    // a list of log files that we need to delete
    std::vector<uint64_t> log_delete_files;

    // the current manifest_file_number, log_number and prev_log_number
    // that corresponds to the set of files in 'live'.
    uint64_t manifest_file_number, log_number, prev_log_number;

    DeletionState() {
      manifest_file_number = 0;
      log_number = 0;
      prev_log_number = 0;
    }
  };

  // Returns the list of live files in 'live' and the list
  // of all files in the filesystem in 'all_files'.
  // If force == false and the last call was less than
  // options_.delete_obsolete_files_period_micros microseconds ago,
  // it will not fill up the deletion_state
  void FindObsoleteFiles(DeletionState& deletion_state,
                         bool force,
                         bool no_full_scan = false);

  // Diffs the files listed in filenames and those that do not
  // belong to live files are posibly removed. Also, removes all the
  // files in sst_delete_files and log_delete_files.
  // It is not necessary to hold the mutex when invoking this method.
  void PurgeObsoleteFiles(DeletionState& deletion_state);

174
 protected:
H
heyongqiang 已提交
175 176
  Env* const env_;
  const std::string dbname_;
177
  unique_ptr<VersionSet> versions_;
H
heyongqiang 已提交
178 179 180 181 182 183
  const InternalKeyComparator internal_comparator_;
  const Options options_;  // options_.comparator == &internal_comparator_

  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
184

185 186
  MemTable* GetMemTable() {
    return mem_;
A
Abhishek Kona 已提交
187
  }
H
heyongqiang 已提交
188

189 190 191
  Iterator* NewInternalIterator(const ReadOptions&,
                                SequenceNumber* latest_snapshot);

J
jorlow@chromium.org 已提交
192 193
 private:
  friend class DB;
194 195
  struct CompactionState;
  struct Writer;
J
jorlow@chromium.org 已提交
196 197 198 199 200 201

  Status NewDB();

  // Recover the descriptor from persistent storage.  May do a significant
  // amount of work to recover recently logged updates.  Any changes to
  // be made to the descriptor are added to *edit.
202
  Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
H
heyongqiang 已提交
203
      bool error_if_log_file_exist = false);
J
jorlow@chromium.org 已提交
204 205 206

  void MaybeIgnoreError(Status* s) const;

207 208
  const Status CreateArchivalDirectory();

J
jorlow@chromium.org 已提交
209 210 211
  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();

212
  // Flush the in-memory write buffer to storage.  Switches to a new
J
jorlow@chromium.org 已提交
213
  // log-file/memtable and writes a new descriptor iff successful.
I
Igor Canadi 已提交
214 215
  Status FlushMemTableToOutputFile(bool* madeProgress,
                                   DeletionState& deletion_state);
J
jorlow@chromium.org 已提交
216 217 218

  Status RecoverLogFile(uint64_t log_number,
                        VersionEdit* edit,
219 220
                        SequenceNumber* max_sequence,
                        MemTable* external_table);
J
jorlow@chromium.org 已提交
221

222 223 224 225 226 227
  // The following two methods are used to flush a memtable to
  // storage. The first one is used atdatabase RecoveryTime (when the
  // database is opened) and is heavyweight because it holds the mutex
  // for the entire period. The second method WriteLevel0Table supports
  // concurrent flush memtables to storage.
  Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
228
  Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
229
                                uint64_t* filenumber);
J
jorlow@chromium.org 已提交
230

J
Jim Paton 已提交
231
  uint64_t SlowdownAmount(int n, int top, int bottom);
232
  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
233
  WriteBatch* BuildBatchGroup(Writer** last_writer);
J
jorlow@chromium.org 已提交
234

H
heyongqiang 已提交
235 236 237
  // Force current memtable contents to be flushed.
  Status FlushMemTable(const FlushOptions& options);

238 239
  // Wait for memtable flushed
  Status WaitForFlushMemTable();
H
heyongqiang 已提交
240

241
  void MaybeScheduleLogDBDeployStats();
242 243
  static void BGLogDBDeployStats(void* db);
  void LogDBDeployStats();
244

245
  void MaybeScheduleFlushOrCompaction();
246 247 248 249
  static void BGWorkCompaction(void* db);
  static void BGWorkFlush(void* db);
  void BackgroundCallCompaction();
  void BackgroundCallFlush();
250
  Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
I
Igor Canadi 已提交
251
  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
252
  void CleanupCompaction(CompactionState* compact, Status status);
I
Igor Canadi 已提交
253 254
  Status DoCompactionWork(CompactionState* compact,
                          DeletionState& deletion_state);
J
jorlow@chromium.org 已提交
255 256 257 258

  Status OpenCompactionOutputFile(CompactionState* compact);
  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
  Status InstallCompactionResults(CompactionState* compact);
259 260
  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
A
Abhishek Kona 已提交
261

262
  void PurgeObsoleteWALFiles();
263

264 265 266
  Status AppendSortedWalsOfType(const std::string& path,
                                VectorLogPtr& log_files,
                                WalFileType type);
267

268 269 270 271 272
  // Requires: all_logs should be sorted with earliest log file first
  // Retains all log files in all_logs which contain updates with seq no.
  // Greater Than or Equal to the requested SequenceNumber.
  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
                                const SequenceNumber target);
273
  //  return true if
274 275
  bool CheckWalFileExistsAndEmpty(const WalFileType type,
                                  const uint64_t number);
276

277 278
  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
                         WriteBatch* const result);
279 280

  Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
281

282 283
  void PrintStatistics();

284
  // dump rocksdb.stats to LOG
285 286
  void MaybeDumpStats();

287 288 289 290
  // Return the minimum empty level that could hold the total data in the
  // input level. Return the input level, if such level could not be found.
  int FindMinimumEmptyLevelFitting(int level);

291 292 293 294
  // Move the files in the input level to the target level.
  // If target_level < 0, automatically calculate the minimum level that could
  // hold the data set.
  void ReFitLevel(int level, int target_level = -1);
295

J
jorlow@chromium.org 已提交
296
  // Constant after construction
S
Sanjay Ghemawat 已提交
297
  const InternalFilterPolicy internal_filter_policy_;
J
jorlow@chromium.org 已提交
298 299 300
  bool owns_info_log_;

  // table_cache_ provides its own synchronization
301
  unique_ptr<TableCache> table_cache_;
J
jorlow@chromium.org 已提交
302

303
  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
J
jorlow@chromium.org 已提交
304 305 306 307 308
  FileLock* db_lock_;

  // State below is protected by mutex_
  port::Mutex mutex_;
  port::AtomicPointer shutting_down_;
H
hans@chromium.org 已提交
309
  port::CondVar bg_cv_;          // Signalled when background work finishes
J
Jim Paton 已提交
310
  std::shared_ptr<MemTableRepFactory> mem_rep_factory_;
J
jorlow@chromium.org 已提交
311
  MemTable* mem_;
312
  MemTableList imm_;             // Memtable that are not changing
313
  uint64_t logfile_number_;
314
  unique_ptr<log::Writer> log_;
315

316 317
  std::string host_name_;

318 319
  // Queue of writers.
  std::deque<Writer*> writers_;
320
  WriteBatch tmp_batch_;
321

J
jorlow@chromium.org 已提交
322 323 324 325 326 327
  SnapshotList snapshots_;

  // Set of table files to protect from deletion because they are
  // part of ongoing compactions.
  std::set<uint64_t> pending_outputs_;

328 329
  // count how many background compaction been scheduled or is running?
  int bg_compaction_scheduled_;
J
jorlow@chromium.org 已提交
330

331 332 333
  // number of background memtable flush jobs, submitted to the HIGH pool
  int bg_flush_scheduled_;

334 335 336
  // Has a background stats log thread scheduled?
  bool bg_logstats_scheduled_;

H
hans@chromium.org 已提交
337 338 339
  // Information for a manual compaction
  struct ManualCompaction {
    int level;
G
Gabor Cselle 已提交
340
    bool done;
341
    bool in_progress;           // compaction request being processed?
342 343
    const InternalKey* begin;   // nullptr means beginning of key range
    const InternalKey* end;     // nullptr means end of key range
G
Gabor Cselle 已提交
344
    InternalKey tmp_storage;    // Used to keep track of compaction progress
H
hans@chromium.org 已提交
345 346
  };
  ManualCompaction* manual_compaction_;
J
jorlow@chromium.org 已提交
347 348 349 350

  // Have we encountered a background error in paranoid mode?
  Status bg_error_;

351
  std::unique_ptr<StatsLogger> logger_;
352

353
  int64_t volatile last_log_ts;
354

355 356 357
  // shall we disable deletion of obsolete files
  bool disable_delete_obsolete_files_;

358 359 360
  // last time when DeleteObsoleteFiles was invoked
  uint64_t delete_obsolete_files_last_run_;

361 362 363
  // last time when PurgeObsoleteWALFiles ran.
  uint64_t purge_wal_files_last_run_;

364
  // last time stats were dumped to LOG
H
Haobo Xu 已提交
365
  std::atomic<uint64_t> last_stats_dump_time_microsec_;
366

367 368 369 370
  // obsolete files will be deleted every this seconds if ttl deletion is
  // enabled and archive size_limit is disabled.
  uint64_t default_interval_to_delete_obsolete_WAL_;

M
Mark Callaghan 已提交
371 372 373 374
  // These count the number of microseconds for which MakeRoomForWrite stalls.
  uint64_t stall_level0_slowdown_;
  uint64_t stall_memtable_compaction_;
  uint64_t stall_level0_num_files_;
375
  std::vector<uint64_t> stall_leveln_slowdown_;
J
Jim Paton 已提交
376 377 378 379
  uint64_t stall_level0_slowdown_count_;
  uint64_t stall_memtable_compaction_count_;
  uint64_t stall_level0_num_files_count_;
  std::vector<uint64_t> stall_leveln_slowdown_count_;
M
Mark Callaghan 已提交
380 381 382 383

  // Time at which this instance was started.
  const uint64_t started_at_;

384 385
  bool flush_on_destroy_; // Used when disableWAL is true.

386 387 388
  // Per level compaction stats.  stats_[level] stores the stats for
  // compactions that produced data for the specified "level".
  struct CompactionStats {
A
Abhishek Kona 已提交
389
    uint64_t micros;
M
Mark Callaghan 已提交
390 391 392 393 394 395 396 397

    // Bytes read from level N during compaction between levels N and N+1
    int64_t bytes_readn;

    // Bytes read from level N+1 during compaction between levels N and N+1
    int64_t bytes_readnp1;

    // Total bytes written during compaction between levels N and N+1
398 399
    int64_t bytes_written;

M
Mark Callaghan 已提交
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
    // Files read from level N during compaction between levels N and N+1
    int     files_in_leveln;

    // Files read from level N+1 during compaction between levels N and N+1
    int     files_in_levelnp1;

    // Files written during compaction between levels N and N+1
    int     files_out_levelnp1;

    // Number of compactions done
    int     count;

    CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
                        bytes_written(0), files_in_leveln(0),
                        files_in_levelnp1(0), files_out_levelnp1(0),
                        count(0) { }
416 417 418

    void Add(const CompactionStats& c) {
      this->micros += c.micros;
M
Mark Callaghan 已提交
419 420
      this->bytes_readn += c.bytes_readn;
      this->bytes_readnp1 += c.bytes_readnp1;
421
      this->bytes_written += c.bytes_written;
M
Mark Callaghan 已提交
422 423 424 425
      this->files_in_leveln += c.files_in_leveln;
      this->files_in_levelnp1 += c.files_in_levelnp1;
      this->files_out_levelnp1 += c.files_out_levelnp1;
      this->count += 1;
426 427
    }
  };
M
Mark Callaghan 已提交
428

429
  std::vector<CompactionStats> stats_;
430

431 432 433 434 435 436 437 438 439 440 441 442 443
  // Used to compute per-interval statistics
  struct StatsSnapshot {
    uint64_t bytes_read_;
    uint64_t bytes_written_;
    uint64_t bytes_new_;
    double   seconds_up_;

    StatsSnapshot() : bytes_read_(0), bytes_written_(0),
                      bytes_new_(0), seconds_up_(0) {}
  };

  StatsSnapshot last_stats_;

H
heyongqiang 已提交
444
  static const int KEEP_LOG_FILE_NUM = 1000;
H
heyongqiang 已提交
445
  std::string db_absolute_path_;
H
heyongqiang 已提交
446

447 448 449
  // count of the number of contiguous delaying writes
  int delayed_writes_;

450
  // The options to access storage files
H
Haobo Xu 已提交
451
  const EnvOptions storage_options_;
452

453 454 455 456 457 458
  // A value of true temporarily disables scheduling of background work
  bool bg_work_gate_closed_;

  // Guard against multiple concurrent refitting
  bool refitting_level_;

J
jorlow@chromium.org 已提交
459 460 461 462
  // No copying allowed
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

463 464
  // dump the delayed_writes_ to the log file and reset counter.
  void DelayLoggingAndReset();
465

466 467 468 469 470 471
  // Return the earliest snapshot where seqno is visible.
  // Store the snapshot right before that, if any, in prev_snapshot
  inline SequenceNumber findEarliestVisibleSnapshot(
    SequenceNumber in,
    std::vector<SequenceNumber>& snapshots,
    SequenceNumber* prev_snapshot);
472

473 474
  // Function that Get and KeyMayExist call with no_io true or false
  // Note: 'value_found' from KeyMayExist propagates here
475 476 477
  Status GetImpl(const ReadOptions& options,
                 const Slice& key,
                 std::string* value,
478
                 bool* value_found = nullptr);
J
jorlow@chromium.org 已提交
479 480 481 482 483 484
};

// Sanitize db options.  The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
S
Sanjay Ghemawat 已提交
485
                               const InternalFilterPolicy* ipolicy,
J
jorlow@chromium.org 已提交
486 487
                               const Options& src);

S
Siying Dong 已提交
488 489 490 491 492 493 494 495 496

// Determine compression type, based on user options, level of the output
// file and whether compression is disabled.
// If enable_compression is false, then compression is always disabled no
// matter what the values of the other two parameters are.
// Otherwise, the compression type is determined based on options and level.
CompressionType GetCompressionType(const Options& options, int level,
                                   const bool enable_compression);

497
}  // namespace rocksdb