// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_

#include <stddef.h>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "leveldb/slice.h"
#include "leveldb/statistics.h"
#include "leveldb/memtablerep.h"

namespace leveldb {

// Forward declarations of types that the option structs below refer to
// only through pointers or smart pointers.
class Cache;
class Comparator;
class Env;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class CompactionFilter;

// Lets the declarations below spell shared_ptr without the std:: prefix.
using std::shared_ptr;

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs.  Each block may be compressed before
// being stored in a file.  The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression     = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression   = 0x2,
  kBZip2Compression  = 0x3
};

// Tunable parameters for compression algorithms such as Zlib.
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;

  // Defaults: window_bits = -14, level = -1, strategy = 0.
  CompressionOptions() : CompressionOptions(-14, -1, 0) {}

  // Initializes each field from the corresponding argument.
  CompressionOptions(int wbits, int lev, int strategy)
      : window_bits(wbits), level(lev), strategy(strategy) {}
};

J
jorlow@chromium.org 已提交
56 57 58 59 60 61 62 63 64 65 66 67 68
// Options to control the behavior of a database (passed to DB::Open)
struct Options {
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

69 70 71 72 73 74 75 76 77 78 79 80
  // REQUIRES: The client must provide a merge operator if Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // openning the DB in this case.
  // Default: nullptr
  const MergeOperator* merge_operator;

81 82 83 84 85
  // Allows an application to modify/delete a key-value during background
  // compaction.
  // Default: nullptr
  const CompactionFilter* compaction_filter;

J
jorlow@chromium.org 已提交
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors.  This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // Default: false
  bool paranoid_checks;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  // Any internal progress/error information generated by the db will
A
Abhishek Kona 已提交
108 109 110
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
111
  shared_ptr<Logger> info_log;
J
jorlow@chromium.org 已提交
112 113 114 115

  // -------------------
  // Parameters that affect performance

116 117
  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
J
jorlow@chromium.org 已提交
118
  //
119
  // Larger values increase performance, especially during bulk loads.
A
Abhishek Kona 已提交
120
  // Up to max_write_buffer_number write buffers may be held in memory
121
  // at the same time,
122
  // so you may wish to adjust this parameter to control memory usage.
123 124
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
J
jorlow@chromium.org 已提交
125
  //
126
  // Default: 4MB
J
jorlow@chromium.org 已提交
127 128
  size_t write_buffer_size;

129
  // The maximum number of write buffers that are built up in memory.
A
Abhishek Kona 已提交
130
  // The default is 2, so that when 1 write buffer is being flushed to
131 132 133 134
  // storage, new writes can continue to the other write buffer.
  // Default: 2
  int max_write_buffer_number;

135
  // The minimum number of write buffers that will be merged together
136
  // before writing to storage.  If set to 1, then
137 138 139 140 141 142 143
  // all write buffers are fushed to L0 as individual files and this increases
  // read amplification because a get request has to check in all of these
  // files. Also, an in-memory merge may result in writing lesser
  // data to storage if there are duplicate records in each of these
  // individual write buffers.  Default: 1
  int min_write_buffer_number_to_merge;

J
jorlow@chromium.org 已提交
144 145 146 147 148 149 150 151 152 153
  // Number of open files that can be used by the DB.  You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;

  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).

A
Abhishek Kona 已提交
154
  // If non-NULL use the specified cache for blocks.
155
  // If NULL, leveldb will automatically create and use an 8MB internal cache.
A
Abhishek Kona 已提交
156
  // Default: nullptr
157
  shared_ptr<Cache> block_cache;
J
jorlow@chromium.org 已提交
158 159 160 161 162 163

  // Approximate size of user data packed per block.  Note that the
  // block size specified here corresponds to uncompressed data.  The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled.  This parameter can be changed dynamically.
  //
164
  // Default: 4K
D
dgrogan@chromium.org 已提交
165
  size_t block_size;
J
jorlow@chromium.org 已提交
166 167 168 169 170 171 172 173

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically.  Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;

174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190

  // Compress blocks using the specified compression algorithm.  This
  // parameter can be changed dynamically.
  //
  // Default: kSnappyCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression.  Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

191 192 193 194
  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to quick compression
  // algorithm while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
A
Abhishek Kona 已提交
195 196
  // be slower. This array, if non nullptr, should have an entry for
  // each level of the database. This array, if non nullptr, overides the
197 198 199 200
  // value specified in the previous field 'compression'. The caller is
  // reponsible for allocating memory and initializing the values in it
  // before invoking Open(). The caller is responsible for freeing this
  // array and it could be freed anytime after the return from Open().
201
  // This could have been a std::vector but that makes the equivalent
202
  // java/C api hard to construct.
203
  std::vector<CompressionType> compression_per_level;
204

205 206 207
  //different options for compression algorithms
  CompressionOptions compression_opts;

A
Abhishek Kona 已提交
208
  // If non-nullptr, use the specified filter policy to reduce disk reads.
209 210 211
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  //
A
Abhishek Kona 已提交
212
  // Default: nullptr
213 214
  const FilterPolicy* filter_policy;

215 216 217 218 219 220 221
  // Number of levels for this database
  int num_levels;

  // Number of files to trigger level-0 compaction. A value <0 means that
  // level-0 compaction will not be triggered by number of files at all.
  int level0_file_num_compaction_trigger;

222 223 224
  // Soft limit on number of level-0 files. We start slowing down writes at this
  // point. A value <0 means that no writing slow down will be triggered by
  // number of files in level-0.
225 226 227 228 229 230 231 232 233 234 235 236 237
  int level0_slowdown_writes_trigger;

  // Maximum number of level-0 files.  We stop writes at this point.
  int level0_stop_writes_trigger;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap.  We try to push to level 2 to avoid the
  // relatively expensive level 0=>1 compactions and to avoid some
  // expensive manifest file operations.  We do not push all the way to
  // the largest level since that can generate a lot of wasted disk
  // space if the same key space is being repeatedly overwritten.
  int max_mem_compaction_level;

238 239 240 241 242 243 244 245 246 247
  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, and each file on level 2 will be 20MB,
  // and each file on level-3 will be 200MB.

  // by default target_file_size_base is 2MB.
248
  int target_file_size_base;
249 250
  // by default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
251 252
  int target_file_size_multiplier;

253 254 255 256 257 258 259 260 261
  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 20MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 20MB, total file size for level-2 will be 200MB,
  // and total file size for level-3 will be 2GB.

262

263
  // by default 'max_bytes_for_level_base' is 10MB.
264
  uint64_t max_bytes_for_level_base;
265
  // by default 'max_bytes_for_level_base' is 10.
H
heyongqiang 已提交
266
  int max_bytes_for_level_multiplier;
267

268 269 270 271 272 273
  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  // Default: 1
  std::vector<int> max_bytes_for_level_multiplier_additional;

H
heyongqiang 已提交
274 275 276 277 278
  // Maximum number of bytes in all compacted files.  We avoid expanding
  // the lower level file set of a compaction if it would make the
  // total compaction cover more than
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
  int expanded_compaction_factor;
279

280
  // Maximum number of bytes in all source files to be compacted in a
A
Abhishek Kona 已提交
281
  // single compaction run. We avoid picking too many files in the
282 283 284 285 286 287 288
  // source level so that we do not exceed the total source bytes
  // for compaction to exceed
  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
  // Default:1, i.e. pick maxfilesize amount of data as the source of
  // a compaction.
  int source_compaction_factor;

H
heyongqiang 已提交
289 290 291
  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;
292

293
  // If non-null, then we should collect metrics about database operations
294 295
  // Statistics objects should not be shared between DB instances as
  // it does not use any locks to prevent concurrent updates.
A
Abhishek Kona 已提交
296
  shared_ptr<Statistics> statistics;
297

298 299 300 301 302 303 304 305
  // If true, then the contents of data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
  // sync to the OS to flush all dirty buffesrs to stable storage.
  // Default: false
  bool disableDataSync;

306 307 308 309 310 311 312
  // If true, then every store to stable storage will issue a fsync.
  // If false, then every store to stable storage will issue a fdatasync.
  // This parameter should be set to true while storing data to
  // filesystem like ext3 which can lose files after a reboot.
  // Default: false
  bool use_fsync;

313 314 315 316 317 318
  // This number controls how often a new scribe log about
  // db deploy stats is written out.
  // -1 indicates no logging at all.
  // Default value is 1800 (half an hour).
  int db_stats_log_interval;

H
heyongqiang 已提交
319 320 321 322 323 324 325
  // This specifies the log dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir;

326 327 328 329 330 331
  // Disable compaction triggered by seek.
  // With bloomfilter and fast storage, a miss on one level
  // is very cheap if the file handle is cached in table cache
  // (which is true if max_open_files is large).
  bool disable_seek_compaction;

332 333 334 335
  // The periodicity when obsolete files get deleted. The default
  // value is 0 which means that obsolete files get removed after
  // every compaction run.
  uint64_t delete_obsolete_files_period_micros;
A
Abhishek Kona 已提交
336

337 338 339
  // Maximum number of concurrent background compactions.
  // Default: 1
  int max_background_compactions;
340

341 342 343 344 345 346 347
  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size;

K
Kai Liu 已提交
348 349 350 351 352 353 354 355 356 357
  // Time for the info log file to roll (in seconds).
  // If specified with non-zero value, log file will be rolled
  // if it has been active longer than `log_file_time_to_roll`.
  // Default: 0 (disabled)
  size_t log_file_time_to_roll;

  // Maximal info log files to be kept.
  // Default: 1000
  size_t keep_log_file_num;

358 359 360 361
  // Puts are delayed when any level has a compaction score that
  // exceeds rate_limit. This is ignored when <= 1.0.
  double rate_limit;

362
  // Max time a put will be stalled when rate_limit is enforced
363
  unsigned int rate_limit_delay_milliseconds;
364

A
Abhishek Kona 已提交
365 366 367 368 369
  // manifest file is rolled over on reaching this limit.
  // The older manifest file be deleted.
  // The default value is MAX_INT so that roll-over does not take place.
  uint64_t max_manifest_file_size;

370 371
  // Disable block cache. If this is set to false,
  // then no block cache should be used, and the block_cache should
A
Abhishek Kona 已提交
372
  // point to a nullptr object.
373 374 375 376 377
  bool no_block_cache;

  // Number of shards used for table cache.
  int table_cache_numshardbits;

X
Xing Jin 已提交
378 379 380 381 382 383 384
  // size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/10 of
  // writer_buffer_size).
  //
  // Default: 0
  size_t arena_block_size;

J
jorlow@chromium.org 已提交
385 386
  // Create an Options object with default values for all fields.
  Options();
387

388
  void Dump(Logger* log) const;
389

390 391
  // Set appropriate parameters for bulk loading.
  // The reason that this is a function that returns "this" instead of a
392
  // constructor is to enable chaining of multiple similar calls in the future.
393 394 395 396 397 398
  //
  // All data will be in level 0 without any automatic compaction.
  // It's recommended to manually call CompactRange(NULL, NULL) before reading
  // from the database, because otherwise the read can be very slow.
  Options* PrepareForBulkLoad();

399 400 401
  // Disable automatic compactions. Manual compactions can still
  // be issued on this database.
  bool disable_auto_compactions;
402 403 404 405 406 407 408 409 410 411

  // The number of seconds a WAL(write ahead log) should be kept after it has
  // been marked as Not Live. If the value is set. The WAL files are moved to
  // the archive direcotory and deleted after the given TTL.
  // If set to 0, WAL files are deleted as soon as they are not required by
  // the database.
  // If set to std::numeric_limits<uint64_t>::max() the WAL files will never be
  // deleted.
  // Default : 0
  uint64_t WAL_ttl_seconds;
412 413 414 415 416 417

  // Number of bytes to preallocate (via fallocate) the manifest
  // files.  Default is 4mb, which is reasonable to reduce random IO
  // as well as prevent overallocation for mounts that preallocate
  // large amounts of data (such as xfs's allocsize option).
  size_t manifest_preallocation_size;
418 419 420 421

  // Purge duplicate/deleted keys when a memtable is flushed to storage.
  // Default: true
  bool purge_redundant_kvs_while_flush;
422 423 424 425 426

  // Data being read from file storage may be buffered in the OS
  // Default: true
  bool allow_os_buffer;

427
  // Allow the OS to mmap file for reading sst tables. Default: false
428 429 430 431
  bool allow_mmap_reads;

  // Allow the OS to mmap file for writing. Default: true
  bool allow_mmap_writes;
432 433 434

  // Disable child process inherit open files. Default: true
  bool is_fd_close_on_exec;
435 436 437 438 439 440

  // Skip log corruption error on recovery (If client is ok with
  // losing most recent changes)
  // Default: false
  bool skip_log_error_on_recovery;

441 442 443
  // if not zero, dump leveldb.stats to LOG every stats_dump_period_sec
  // Default: 3600 (1 hour)
  unsigned int stats_dump_period_sec;
444 445 446 447 448 449 450 451

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is less
  // than this specified number and adding a new record to the block will
  // exceed the configured block size, then this block will be closed and the
  // new record will be written to the next block.
  // Default is 10.
  int block_size_deviation;
452 453 454 455 456 457 458 459 460 461

  // If set true, will hint the underlying file system that the file
  // access pattern is random, when a sst file is opened.
  // Default: true
  bool advise_random_on_open;

  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
  enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
H
Haobo Xu 已提交
462 463 464 465 466 467 468 469

  // Use adaptive mutex, which spins in the user space before resorting
  // to kernel. This could reduce context switch when the mutex is not
  // heavily contended. However, if the mutex is hot, we could end up
  // wasting spin time.
  // Default: false
  bool use_adaptive_mutex;

H
Haobo Xu 已提交
470 471 472 473 474 475
  // Allows OS to incrementally sync files to disk while they are being
  // written, asynchronously, in the background.
  // Issue one request for every bytes_per_sync written. 0 turns it off.
  // Default: 0
  uint64_t bytes_per_sync;

476 477 478 479
  // Use KeyMayExist API to filter deletes when this is true.
  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
  // the delete is a noop. KeyMayExist only incurs in-memory look up.
  // This optimization avoids writing the delete to storage when appropriate.
480
  // Default: false
481
  bool filter_deletes;
482

J
Jim Paton 已提交
483 484 485 486 487
  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory;

J
jorlow@chromium.org 已提交
488 489 490 491 492 493 494 495 496 497 498 499 500 501
};

// Options that control read operations
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;

  // Should the data read for this iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;

A
Abhishek Kona 已提交
502
  // If "snapshot" is non-nullptr, read as of the supplied snapshot
J
jorlow@chromium.org 已提交
503
  // (which must belong to the DB that is being read and which must
A
Abhishek Kona 已提交
504
  // not have been released).  If "snapshot" is nullptr, use an impliicit
J
jorlow@chromium.org 已提交
505
  // snapshot of the state at the beginning of this read operation.
A
Abhishek Kona 已提交
506
  // Default: nullptr
J
jorlow@chromium.org 已提交
507 508 509 510 511
  const Snapshot* snapshot;

  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
A
Abhishek Kona 已提交
512
        snapshot(nullptr) {
J
jorlow@chromium.org 已提交
513
  }
514 515
  ReadOptions(bool cksum, bool cache) :
              verify_checksums(cksum), fill_cache(cache),
A
Abhishek Kona 已提交
516
              snapshot(nullptr) {
517
  }
J
jorlow@chromium.org 已提交
518 519 520 521 522 523 524 525 526 527 528 529 530 531
};

// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete.  If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost.  Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // In other words, a DB write with sync==false has similar
  // crash semantics as the "write()" system call.  A DB write
  // with sync==true has similar crash semantics to a "write()"
  // system call followed by "fsync()".
  //
  // Default: false
  bool sync;

  // If true, writes will not first go to the write ahead log,
  // and the write may get lost after a crash.
  // Default: false
  bool disableWAL;

  // Default construction: both sync and disableWAL are false.
  WriteOptions()
      : sync(false),
        disableWAL(false) {
  }
};

// Options that control flush operations
struct FlushOptions {
  // When true, the flush call waits until the flush is done.
  // Default: true
  bool wait;

  // A default-constructed FlushOptions has wait == true.
  FlushOptions() : wait(true) {}
};

}  // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_