version_set.h 25.0 KB
Newer Older
1 2 3 4 5
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
J
jorlow@chromium.org 已提交
6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions.  The
// newest version is called "current".  Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level.  The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.

20
#pragma once
J
jorlow@chromium.org 已提交
21
#include <map>
22
#include <memory>
J
jorlow@chromium.org 已提交
23 24
#include <set>
#include <vector>
25
#include <deque>
26
#include <atomic>
27
#include <limits>
J
jorlow@chromium.org 已提交
28
#include "db/dbformat.h"
S
sdong 已提交
29
#include "db/version_builder.h"
J
jorlow@chromium.org 已提交
30 31
#include "db/version_edit.h"
#include "port/port.h"
32
#include "db/table_cache.h"
33
#include "db/compaction.h"
I
Igor Canadi 已提交
34
#include "db/compaction_picker.h"
I
Igor Canadi 已提交
35 36
#include "db/column_family.h"
#include "db/log_reader.h"
37
#include "db/file_indexer.h"
38
#include "db/write_controller.h"
J
jorlow@chromium.org 已提交
39

40
namespace rocksdb {
J
jorlow@chromium.org 已提交
41

S
sdong 已提交
42 43 44
namespace log {
class Writer;
}
J
jorlow@chromium.org 已提交
45 46 47

class Compaction;
class Iterator;
H
Haobo Xu 已提交
48 49
class LogBuffer;
class LookupKey;
J
jorlow@chromium.org 已提交
50 51 52
class MemTable;
class Version;
class VersionSet;
53
class MergeContext;
I
Igor Canadi 已提交
54
class ColumnFamilyData;
I
Igor Canadi 已提交
55
class ColumnFamilySet;
56
class TableCache;
57
class MergeIteratorBuilder;
J
jorlow@chromium.org 已提交
58

59 60 61 62 63
// Return the smallest index i such that file_level.files[i]->largest >= key.
// Return file_level.num_files if there is no such file.
// REQUIRES: "file_level.files" contains a sorted list of
// non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
64
                    const LevelFilesBrief& file_level,
65 66
                    const Slice& key);

67
// Returns true iff some file in "files" overlaps the user key range
G
Gabor Cselle 已提交
68
// [*smallest,*largest].
A
Abhishek Kona 已提交
69 70
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key larger than all keys in the DB.
71 72
// REQUIRES: If disjoint_sorted_files, file_level.files[]
// contains disjoint ranges in sorted order.
73 74
extern bool SomeFileOverlapsRange(
    const InternalKeyComparator& icmp,
G
Gabor Cselle 已提交
75
    bool disjoint_sorted_files,
76
    const LevelFilesBrief& file_level,
G
Gabor Cselle 已提交
77 78
    const Slice* smallest_user_key,
    const Slice* largest_user_key);
79

80
// Generate LevelFilesBrief from vector<FdWithKeyRange*>
F
Feng Zhu 已提交
81 82
// Would copy smallest_key and largest_key data to sequential memory
// arena: Arena used to allocate the memory
83
extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
F
Feng Zhu 已提交
84 85 86
        const std::vector<FileMetaData*>& files,
        Arena* arena);

S
sdong 已提交
87
class VersionStorageInfo {
J
jorlow@chromium.org 已提交
88
 public:
S
sdong 已提交
89 90 91 92 93
  VersionStorageInfo(const InternalKeyComparator* internal_comparator,
                     const Comparator* user_comparator, int num_levels,
                     CompactionStyle compaction_style,
                     VersionStorageInfo* src_vstorage);
  ~VersionStorageInfo();
94

S
sdong 已提交
95 96 97 98
  // Calls reserve(size) on the file list of `level`.
  void Reserve(int level, size_t size) {
    auto& level_files = files_[level];
    level_files.reserve(size);
  }

  void MaybeAddFile(int level, FileMetaData* f);

S
sdong 已提交
99 100 101 102 103 104 105 106 107 108 109 110 111
  // Sets finalized_, the flag that the assert-guarded accessors of this
  // class check.
  void SetFinalized() {
    finalized_ = true;
  }

  // Update num_non_empty_levels_.
  void UpdateNumNonEmptyLevels();

  void GenerateFileIndexer() {
    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
  }

  // Update the accumulated stats from a file-meta.
  void UpdateAccumulatedStats(FileMetaData* file_meta);

  void ComputeCompensatedSizes();
112

113 114
  // Updates internal structures that keep track of compaction scores
  // We use compaction scores to figure out which compaction to do next
115 116
  // REQUIRES: If Version is not yet saved to current_, it can be called without
  // a lock. Once a version is saved to current_, call only with mutex held
S
sdong 已提交
117
  // TODO find a better way to pass compaction_options_fifo.
118 119
  void ComputeCompactionScore(
      const MutableCFOptions& mutable_cf_options,
S
sdong 已提交
120
      const CompactionOptionsFIFO& compaction_options_fifo,
121
      std::vector<uint64_t>& size_being_compacted);
122

123 124
  // Generate level_files_brief_ from files_
  void GenerateLevelFilesBrief();
S
sdong 已提交
125 126 127
  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
  void UpdateFilesBySize();
128

S
sdong 已提交
129
  int MaxInputLevel() const;
J
jorlow@chromium.org 已提交
130

131 132 133 134
  // Returns true iff some level needs a compaction.
  bool NeedsCompaction() const;

  // Returns the maximum compaction score for levels 1 to max
135
  double max_compaction_score() const { return max_compaction_score_; }
136 137

  // See field declaration
138
  int max_compaction_score_level() const { return max_compaction_score_level_; }
139

140 141 142 143 144 145
  // Level number stored at position idx of compaction_level_, i.e. the level
  // that has the idx'th highest score.
  int CompactionScoreLevel(int idx) const {
    const int level = compaction_level_[idx];
    return level;
  }

  // Score stored at position idx of compaction_score_, i.e. the idx'th
  // highest score.
  double CompactionScore(int idx) const {
    const double score = compaction_score_[idx];
    return score;
  }

G
Gabor Cselle 已提交
146
  void GetOverlappingInputs(
S
sdong 已提交
147 148
      int level, const InternalKey* begin,  // nullptr means before all keys
      const InternalKey* end,               // nullptr means after all keys
149
      std::vector<FileMetaData*>* inputs,
S
sdong 已提交
150 151
      int hint_index = -1,         // index of overlap file
      int* file_index = nullptr);  // return index of overlap file
G
Gabor Cselle 已提交
152

153
  void GetOverlappingInputsBinarySearch(
S
sdong 已提交
154 155
      int level, const Slice& begin,  // nullptr means before all keys
      const Slice& end,               // nullptr means after all keys
156
      std::vector<FileMetaData*>* inputs,
S
sdong 已提交
157 158
      int hint_index,    // index of overlap file
      int* file_index);  // return index of overlap file
159 160

  void ExtendOverlappingInputs(
S
sdong 已提交
161 162
      int level, const Slice& begin,  // nullptr means before all keys
      const Slice& end,               // nullptr means after all keys
163
      std::vector<FileMetaData*>* inputs,
S
sdong 已提交
164
      unsigned int index);  // start extending from this index
165

166
  // Returns true iff some file in the specified level overlaps
G
Gabor Cselle 已提交
167 168 169
  // some part of [*smallest_user_key,*largest_user_key].
  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
  // largest_user_key==NULL represents a key larger than all keys in the DB.
S
sdong 已提交
170
  bool OverlapInLevel(int level, const Slice* smallest_user_key,
G
Gabor Cselle 已提交
171 172
                      const Slice* largest_user_key);

173 174 175 176 177 178 179
  // Returns true iff the first or last file in inputs contains
  // an overlapping user key to the file "just outside" of it (i.e.
  // just after the last file, or just before the first file)
  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                             int level);

G
Gabor Cselle 已提交
180 181
  // Return the level at which we should place a new memtable compaction
  // result that covers the range [smallest_user_key,largest_user_key].
182 183
  int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options,
                                 const Slice& smallest_user_key,
G
Gabor Cselle 已提交
184
                                 const Slice& largest_user_key);
185

186
  int num_levels() const { return num_levels_; }
187

188
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
189
  int num_non_empty_levels() const {
190 191 192 193
    assert(finalized_);
    return num_non_empty_levels_;
  }

194 195 196 197 198
  // Returns the number of files stored at `level`.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  int NumLevelFiles(int level) const {
    assert(finalized_);
    // size() yields a size_t; convert explicitly instead of relying on an
    // implicit narrowing conversion to the int return type.
    return static_cast<int>(files_[level].size());
  }
199

200
  // Return the combined file size of all files at the specified level.
201
  uint64_t NumLevelBytes(int level) const;
202

S
sdong 已提交
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
  // Read-only view of the file list kept for `level`.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  const std::vector<FileMetaData*>& LevelFiles(int level) const {
    const std::vector<FileMetaData*>& level_files = files_[level];
    return level_files;
  }

  // Read-only access to the level_files_brief_ entry of `level`.
  const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const {
    const auto& brief = level_files_brief_[level];
    return brief;
  }

  // Read-only view of the files_by_size_ entry of `level` (indices into that
  // level's file list).
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  const std::vector<int>& FilesBySize(int level) const {
    assert(finalized_);
    const std::vector<int>& by_size = files_by_size_[level];
    return by_size;
  }

  // Stores `index` as the position in files_by_size_ from which the next
  // compaction candidate file of `level` is looked up.
  // REQUIRES: lock is held
  void SetNextCompactionIndex(int level, int index) {
    int& cursor = next_file_to_compact_by_size_[level];
    cursor = index;
  }

  // Current value of next_file_to_compact_by_size_ for `level`.
  // REQUIRES: lock is held
  int NextCompactionIndex(int level) const {
    const int cursor = next_file_to_compact_by_size_[level];
    return cursor;
  }
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
231
  const FileIndexer& file_indexer() const {
S
sdong 已提交
232 233 234 235 236 237 238 239 240 241 242
    assert(finalized_);
    return file_indexer_;
  }

  // Only the first few entries of files_by_size_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  static const size_t kNumberFilesToSort = 50;

243 244 245
  // Return a human-readable short (single-line) summary of the number
  // of files per level.  Uses *scratch as backing store.
  struct LevelSummaryStorage {
246
    char buffer[1000];
247 248
  };
  struct FileSummaryStorage {
249
    char buffer[3000];
250 251 252 253 254 255 256 257 258 259
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
  // Return a human-readable short (single-line) summary of files
  // in a specified level.  Uses *scratch as backing store.
  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64_t MaxNextLevelOverlappingBytes();

J
jorlow@chromium.org 已提交
260
  // Return a human readable string that describes this version's contents.
Z
Zheng Shao 已提交
261
  std::string DebugString(bool hex = false) const;
J
jorlow@chromium.org 已提交
262

263
  uint64_t GetAverageValueSize() const {
264
    if (accumulated_num_non_deletions_ == 0) {
265 266
      return 0;
    }
267 268
    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
    assert(accumulated_file_size_ > 0);
S
sdong 已提交
269
    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
270 271
           accumulated_file_size_ /
           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
272 273
  }

S
sdong 已提交
274
  uint64_t GetEstimatedActiveKeys() const;
S
sdong 已提交
275 276 277 278 279 280 281

  // re-initializes the index that is used to offset into files_by_size_
  // to find the next compaction candidate file.
  void ResetNextCompactionIndex(int level) {
    next_file_to_compact_by_size_[level] = 0;
  }

S
sdong 已提交
282 283 284 285
  // Returns the internal-key comparator pointer this object was given.
  // Marked const (it only reads a member) so that it can also be called on a
  // const VersionStorageInfo, like the other read-only accessors of the class.
  const InternalKeyComparator* InternalComparator() const {
    return internal_comparator_;
  }

S
sdong 已提交
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
 private:
  // Comparators handed out by InternalComparator() and by
  // Version::internal_comparator() / Version::user_comparator().
  const InternalKeyComparator* internal_comparator_;
  const Comparator* user_comparator_;
  int num_levels_;            // Number of levels
  int num_non_empty_levels_;  // Number of levels. Any level larger than it
                              // is guaranteed to be empty.
  // A short brief metadata of files per level
  autovector<rocksdb::LevelFilesBrief> level_files_brief_;
  FileIndexer file_indexer_;
  // Passed to file_indexer_.UpdateIndex() in GenerateFileIndexer().
  // NOTE(review): the previous comment said it backs "file_levels_", which is
  // not a member of this class; presumably level_files_brief_ was meant --
  // confirm against the .cc file.
  Arena arena_;

  CompactionStyle compaction_style_;

  // List of files per level, files in each level are arranged
  // in increasing order of keys. Indexed by level (files_[level]).
  std::vector<FileMetaData*>* files_;

  // A list for the same set of files that are stored in files_,
  // but files in each level are now sorted based on file
  // size. The file with the largest size is at the front.
  // This vector stores the index of the file from files_.
  std::vector<std::vector<int>> files_by_size_;

  // An index into files_by_size_ that specifies the first
  // file that is not yet compacted. One entry per level; see
  // SetNextCompactionIndex() / ResetNextCompactionIndex().
  std::vector<int> next_file_to_compact_by_size_;

  // Only the first few entries of files_by_size_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  // NOTE(review): duplicates the public kNumberFilesToSort (same value, same
  // comment); consider keeping only one of the two.
  static const size_t number_of_files_to_sort_ = 50;

  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed.
  // NOTE(review): the original comment says these fields are initialized by
  // Finalize(), but no Finalize() is declared in this header; presumably
  // ComputeCompactionScore() fills them now -- confirm.
  // The most critical level to be compacted is listed first
  // These are used to pick the best compaction level
  std::vector<double> compaction_score_;
  std::vector<int> compaction_level_;
  double max_compaction_score_ = 0.0;   // max score in l1 to ln-1
  int max_compaction_score_level_ = 0;  // level on which max score occurs

  // the following are the sampled temporary stats.
  // the current accumulated size of sampled files.
  uint64_t accumulated_file_size_;
  // the current accumulated size of all raw keys based on the sampled files.
  uint64_t accumulated_raw_key_size_;
  // the current accumulated size of all raw values based on the sampled files.
  uint64_t accumulated_raw_value_size_;
  // total number of non-deletion entries
  uint64_t accumulated_num_non_deletions_;
  // total number of deletion entries
  uint64_t accumulated_num_deletions_;
  // the number of samples
  uint64_t num_samples_;

  // Set to true by SetFinalized(); asserted by several accessors
  // (e.g. num_non_empty_levels(), NumLevelFiles(), FilesBySize()).
  bool finalized_;

  friend class Version;
  friend class VersionSet;
  // No copying allowed
  VersionStorageInfo(const VersionStorageInfo&) = delete;
  void operator=(const VersionStorageInfo&) = delete;
};

class Version {
 public:
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    MergeIteratorBuilder* merger_iter_builder);

  // Lookup the value for key.  If found, store it in *val.
  // The function itself returns void; the outcome (OK or a non-OK status)
  // is reported through *status.
  // Presumably uses *merge_context to store merge_operator operations to
  // apply later (the old comment called this "*operands") -- confirm.
  // value_found is an optional out-parameter (defaults to nullptr).
  // REQUIRES: lock is not held
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
           Status* status, MergeContext* merge_context,
           bool* value_found = nullptr);

  // Update scores, pre-calculated variables. It needs to be called before
  // applying the version to the version set.
  void PrepareApply(const MutableCFOptions& mutable_cf_options,
                    std::vector<uint64_t>& size_being_compacted);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
  // Decrease reference count. Delete the object if no reference left
  // and return true. Otherwise, return false.
  bool Unref();

  // Add all files listed in the current version to *live.
  void AddLiveFiles(std::vector<FileDescriptor>* live);

  // Return a human readable string that describes this version's contents.
  std::string DebugString(bool hex = false) const;

  // Returns the version number of this version
  uint64_t GetVersionNumber() const {
    const uint64_t number = version_number_;
    return number;
  }

390 391 392 393 394 395 396 397 398
  // REQUIRES: lock is held
  // On success, "tp" will contain the table properties of the file
  // specified in "file_meta".  If the file name of "file_meta" is
  // known ahead, passing it by a non-null "fname" can save a
  // file-name conversion.
  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
                            const FileMetaData* file_meta,
                            const std::string* fname = nullptr);

399 400 401 402 403 404
  // REQUIRES: lock is held
  // On success, *props will be populated with all SSTables' table properties.
  // The keys of `props` are the sst file name, the values of `props` are the
  // tables' properties, represented as shared_ptr.
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);

S
sdong 已提交
405
  uint64_t GetEstimatedActiveKeys() {
S
sdong 已提交
406
    return storage_info_.GetEstimatedActiveKeys();
S
sdong 已提交
407
  }
S
sdong 已提交
408

409 410
  size_t GetMemoryUsageByTableReaders();

411 412
  ColumnFamilyData* cfd() const { return cfd_; }

413

414 415 416 417 418
  // Debug-only accessor: the Version that follows this one in the linked
  // list (next_).
  Version* TEST_Next() const { return next_; }

S
sdong 已提交
419 420 421
  VersionStorageInfo* storage_info() { return &storage_info_; }

  VersionSet* version_set() { return vset_; }
L
Lei Jin 已提交
422

423 424
  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);

J
jorlow@chromium.org 已提交
425 426
 private:
  friend class VersionSet;
S
sdong 已提交
427

S
sdong 已提交
428 429
  const InternalKeyComparator* internal_comparator() const {
    return storage_info_.internal_comparator_;
S
sdong 已提交
430
  }
S
sdong 已提交
431 432
  const Comparator* user_comparator() const {
    return storage_info_.user_comparator_;
S
sdong 已提交
433
  }
J
jorlow@chromium.org 已提交
434

435
  bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter,
L
Lei Jin 已提交
436
                      const Slice& internal_prefix) const;
J
jorlow@chromium.org 已提交
437

438
  // The helper function of UpdateAccumulatedStats, which may fill the missing
439 440 441 442
  // fields of file_meta from its associated TableProperties.
  // Returns true if it does initialize FileMetaData.
  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);

443 444 445
  // Update the accumulated stats associated with the current version.
  // This accumulated stats will be used in compaction.
  void UpdateAccumulatedStats();
446

447 448 449 450
  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
  void UpdateFilesBySize();

451
  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
S
sdong 已提交
452 453
  Logger* info_log_;
  Statistics* db_statistics_;
454 455
  TableCache* table_cache_;
  const MergeOperator* merge_operator_;
456

S
sdong 已提交
457
  VersionStorageInfo storage_info_;
J
jorlow@chromium.org 已提交
458 459
  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
460
  Version* prev_;               // Previous version in linked list
J
jorlow@chromium.org 已提交
461 462
  int refs_;                    // Number of live refs to this version

463 464 465 466
  // A version number that uniquely represents this version. This is
  // used for debugging and logging purposes only.
  uint64_t version_number_;

467
  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
J
jorlow@chromium.org 已提交
468

469
  ~Version();
470

J
jorlow@chromium.org 已提交
471 472 473 474 475 476 477
  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};

class VersionSet {
 public:
478 479
  VersionSet(const std::string& dbname, const DBOptions* db_options,
             const EnvOptions& env_options, Cache* table_cache,
480
             WriteController* write_controller);
J
jorlow@chromium.org 已提交
481 482 483 484
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
485
  // current version.  Will release *mu while actually writing to the file.
486
  // column_family_options has to be set if edit is column family add
487 488
  // REQUIRES: *mu is held on entry.
  // REQUIRES: no other thread concurrently calls LogAndApply()
S
sdong 已提交
489 490 491 492 493 494
  Status LogAndApply(
      ColumnFamilyData* column_family_data,
      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
      port::Mutex* mu, Directory* db_directory = nullptr,
      bool new_descriptor_log = false,
      const ColumnFamilyOptions* column_family_options = nullptr);
495

J
jorlow@chromium.org 已提交
496
  // Recover the last saved descriptor from persistent storage.
497 498 499 500
  // If read_only == true, Recover() will not complain if some column families
  // are not opened
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
                 bool read_only = false);
I
Igor Canadi 已提交
501 502 503 504 505

  // Reads a manifest file and returns a list of column families in
  // column_families.
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
                                   const std::string& dbname, Env* env);
J
jorlow@chromium.org 已提交
506

I
Igor Canadi 已提交
507
#ifndef ROCKSDB_LITE
508 509 510
  // Try to reduce the number of levels. This call is valid when
  // only one level from the new max level to the old
  // max level contains files.
511 512 513
  // The call is static, since number of levels is immutable during
  // the lifetime of a RocksDB instance. It reduces number of levels
  // in a DB by applying changes to manifest.
514 515 516
  // For example, a db currently has 7 levels [0-6], and a call to
  // reduce to 5 [0-4] can only be executed when only one level
  // among [4-6] contains files.
517 518
  static Status ReduceNumberOfLevels(const std::string& dbname,
                                     const Options* options,
519
                                     const EnvOptions& env_options,
520
                                     int new_levels);
521

I
Igor Canadi 已提交
522 523 524 525 526 527
  // printf contents (for debugging)
  Status DumpManifest(Options& options, std::string& manifestFileName,
                      bool verbose, bool hex = false);

#endif  // ROCKSDB_LITE

J
jorlow@chromium.org 已提交
528
  // Return the current manifest file number
529
  uint64_t manifest_file_number() const { return manifest_file_number_; }
J
jorlow@chromium.org 已提交
530

531
  uint64_t pending_manifest_file_number() const {
532 533 534
    return pending_manifest_file_number_;
  }

535
  uint64_t current_next_file_number() const { return next_file_number_.load(); }
I
Igor Canadi 已提交
536

J
jorlow@chromium.org 已提交
537
  // Allocate and return a new file number
538
  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1) + 1; }
J
jorlow@chromium.org 已提交
539

H
heyongqiang 已提交
540 541 542
  // Arrange to reuse "file_number" unless a newer file number has
  // already been allocated.
  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
543
  void ReuseLogFileNumber(uint64_t file_number) {
544 545 546
    auto expected = file_number + 1;
    std::atomic_compare_exchange_strong(&next_file_number_, &expected,
                                        file_number);
H
heyongqiang 已提交
547 548
  }

549
  // Return the last sequence number.
I
Igor Canadi 已提交
550 551 552
  // Acquire-load of last_sequence_.
  uint64_t LastSequence() const {
    const uint64_t seq = last_sequence_.load(std::memory_order_acquire);
    return seq;
  }
553 554 555 556

  // Set the last sequence number to s.
  void SetLastSequence(uint64_t s) {
    assert(s >= last_sequence_);
I
Igor Canadi 已提交
557
    last_sequence_.store(s, std::memory_order_release);
558 559
  }

560
  // Mark the specified file number as used.
561 562
  // REQUIRED: this is only called during single-threaded recovery
  void MarkFileNumberUsedDuringRecovery(uint64_t number);
563

564 565
  // Return the log file number for the log file that is currently
  // being compacted, or zero if there is no such log file.
566
  uint64_t prev_log_number() const { return prev_log_number_; }
567

568 569 570
  // Returns the minimum log number such that all
  // log numbers less than or equal to it can be deleted
  uint64_t MinLogNumber() const {
571
    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
572
    for (auto cfd : *column_family_set_) {
573
      if (min_log_num > cfd->GetLogNumber()) {
574
        min_log_num = cfd->GetLogNumber();
575 576 577 578 579
      }
    }
    return min_log_num;
  }

J
jorlow@chromium.org 已提交
580 581 582 583 584
  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);

  // Add all files listed in any live version to *live.
585
  void AddLiveFiles(std::vector<FileDescriptor>* live_list);
J
jorlow@chromium.org 已提交
586 587 588 589 590

  // Return the approximate offset in the database of the data for
  // "key" as of version "v".
  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);

591
  // Return the size of the current manifest file
592
  uint64_t manifest_file_size() const { return manifest_file_size_; }
593 594 595 596 597 598 599

  // verify that the files that we started with for a compaction
  // still exist in the current version and in the same original level.
  // This ensures that a concurrent compaction did not erroneously
  // pick the same files to compact.
  bool VerifyCompactionFileConsistency(Compaction* c);

600
  Status GetMetadataForFile(uint64_t number, int* filelevel,
601
                            FileMetaData** metadata, ColumnFamilyData** cfd);
602

603
  void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
604

605
  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
I
Igor Canadi 已提交
606

I
Igor Canadi 已提交
607
  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
608
  const EnvOptions& env_options() { return env_options_; }
609

J
jorlow@chromium.org 已提交
610
 private:
611
  struct ManifestWriter;
J
jorlow@chromium.org 已提交
612 613

  friend class Version;
614
  friend class DBImpl;
J
jorlow@chromium.org 已提交
615

I
Igor Canadi 已提交
616 617 618 619 620 621 622
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) {
      if (this->status->ok()) *this->status = s;
    }
  };

623 624 625
  // Save current contents to *log
  Status WriteSnapshot(log::Writer* log);

626
  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
627

628 629
  bool ManifestContains(uint64_t manifest_file_number,
                        const std::string& record) const;
630

631
  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
632 633
                                       VersionEdit* edit);

I
Igor Canadi 已提交
634 635
  std::unique_ptr<ColumnFamilySet> column_family_set_;

J
jorlow@chromium.org 已提交
636 637
  Env* const env_;
  const std::string dbname_;
638
  const DBOptions* const db_options_;
639
  std::atomic<uint64_t> next_file_number_;
J
jorlow@chromium.org 已提交
640
  uint64_t manifest_file_number_;
641
  uint64_t pending_manifest_file_number_;
I
Igor Canadi 已提交
642
  std::atomic<uint64_t> last_sequence_;
643
  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
J
jorlow@chromium.org 已提交
644 645

  // Opened lazily
646
  unique_ptr<log::Writer> descriptor_log_;
J
jorlow@chromium.org 已提交
647

648 649 650 651 652 653
  // generates an increasing version number for every new version
  uint64_t current_version_number_;

  // Queue of writers to the manifest file
  std::deque<ManifestWriter*> manifest_writers_;

654
  // Current size of manifest file
655
  uint64_t manifest_file_size_;
A
Abhishek Kona 已提交
656

I
Igor Canadi 已提交
657 658
  std::vector<FileMetaData*> obsolete_files_;

659 660
  // env options for all reads and writes except compactions
  const EnvOptions& env_options_;
661

662 663 664
  // env options used for compactions. This is a copy of
  // env_options_ but with readaheads set to readahead_compactions_.
  const EnvOptions env_options_compactions_;
665

J
jorlow@chromium.org 已提交
666 667 668
  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);
669

I
Igor Canadi 已提交
670
  void LogAndApplyCFHelper(VersionEdit* edit);
S
sdong 已提交
671
  void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v,
672
                         VersionEdit* edit, port::Mutex* mu);
J
jorlow@chromium.org 已提交
673 674
};

675
}  // namespace rocksdb