// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_

#include <stddef.h>
#include <string>
#include <stdint.h>
#include "leveldb/slice.h"

namespace leveldb {

// Forward declarations.  The declarations in this header refer to these
// types only through pointers, so their full definitions are not
// required here.
class Cache;
class Comparator;
class Env;
class FilterPolicy;
class Logger;
class Snapshot;
class Statistics;

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs.  Each block may be compressed before
// being stored in a file.  The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression     = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression   = 0x2,
  kBZip2Compression  = 0x3
};

// Per-algorithm compression parameters (used by algorithms such as Zlib).
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;

  // Default settings: window_bits = -14, level = -1, strategy = 0.
  CompressionOptions()
      : window_bits(-14),
        level(-1),
        strategy(0) {
  }

  // Initializes every field from the caller-supplied values.
  CompressionOptions(int wbits, int lev, int strategy)
      : window_bits(wbits),
        level(lev),
        strategy(strategy) {
  }
};

J
jorlow@chromium.org 已提交
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
// Options to control the behavior of a database (passed to DB::Open)
struct Options {
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors.  This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // Default: false
  bool paranoid_checks;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  // Any internal progress/error information generated by the db will
84
  // be written to info_log if it is non-NULL, or to a file stored
J
jorlow@chromium.org 已提交
85 86
  // in the same directory as the DB contents if info_log is NULL.
  // Default: NULL
87
  Logger* info_log;
J
jorlow@chromium.org 已提交
88 89 90 91

  // -------------------
  // Parameters that affect performance

92 93
  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
J
jorlow@chromium.org 已提交
94
  //
95 96 97
  // Larger values increase performance, especially during bulk loads.
  // Up to two write buffers may be held in memory at the same time,
  // so you may wish to adjust this parameter to control memory usage.
98 99
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
J
jorlow@chromium.org 已提交
100
  //
101
  // Default: 4MB
J
jorlow@chromium.org 已提交
102 103 104 105 106 107 108 109 110 111 112 113
  size_t write_buffer_size;

  // Number of open files that can be used by the DB.  You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;

  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).

114 115
  // If non-NULL, use the specified cache for blocks.
  // If NULL, leveldb will automatically create and use an 8MB internal cache.
J
jorlow@chromium.org 已提交
116 117 118 119 120 121 122 123
  // Default: NULL
  Cache* block_cache;

  // Approximate size of user data packed per block.  Note that the
  // block size specified here corresponds to uncompressed data.  The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled.  This parameter can be changed dynamically.
  //
124
  // Default: 4K
D
dgrogan@chromium.org 已提交
125
  size_t block_size;
J
jorlow@chromium.org 已提交
126 127 128 129 130 131 132 133

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically.  Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;

134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150

  // Compress blocks using the specified compression algorithm.  This
  // parameter can be changed dynamically.
  //
  // Default: kSnappyCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression.  Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

151 152 153 154 155 156 157 158 159 160
  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to quick compression
  // algorithm while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non NULL, should have an entry for
  // each level of the database. This array, if non NULL, overides the
  // value specified in the previous field 'compression'. The caller is
  // reponsible for allocating memory and initializing the values in it
  // before invoking Open(). The caller is responsible for freeing this
  // array and it could be freed anytime after the return from Open().
161
  // This could have been a std::vector but that makes the equivalent
162 163 164
  // java/C api hard to construct.
  CompressionType* compression_per_level;

165 166 167
  //different options for compression algorithms
  CompressionOptions compression_opts;

168 169 170 171 172 173 174
  // If non-NULL, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  //
  // Default: NULL
  const FilterPolicy* filter_policy;

175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
  // Number of levels for this database
  int num_levels;

  // Number of files to trigger level-0 compaction. A value <0 means that
  // level-0 compaction will not be triggered by number of files at all.
  int level0_file_num_compaction_trigger;

  // Soft limit on number of level-0 files. We slow down writes at this point.
  // A value <0 means that no writing slow down will be triggered by number
  // of files in level-0.
  int level0_slowdown_writes_trigger;

  // Maximum number of level-0 files.  We stop writes at this point.
  int level0_stop_writes_trigger;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap.  We try to push to level 2 to avoid the
  // relatively expensive level 0=>1 compactions and to avoid some
  // expensive manifest file operations.  We do not push all the way to
  // the largest level since that can generate a lot of wasted disk
  // space if the same key space is being repeatedly overwritten.
  int max_mem_compaction_level;

198 199 200 201 202 203 204 205 206 207
  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, and each file on level 2 will be 20MB,
  // and each file on level-3 will be 200MB.

  // by default target_file_size_base is 2MB.
208
  int target_file_size_base;
209 210
  // by default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
211 212
  int target_file_size_multiplier;

213 214 215 216 217 218 219 220 221
  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 20MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 20MB, total file size for level-2 will be 200MB,
  // and total file size for level-3 will be 2GB.

222

223
  // by default 'max_bytes_for_level_base' is 10MB.
224
  int max_bytes_for_level_base;
225
  // by default 'max_bytes_for_level_base' is 10.
H
heyongqiang 已提交
226
  int max_bytes_for_level_multiplier;
227

H
heyongqiang 已提交
228 229 230 231 232
  // Maximum number of bytes in all compacted files.  We avoid expanding
  // the lower level file set of a compaction if it would make the
  // total compaction cover more than
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
  int expanded_compaction_factor;
233

H
heyongqiang 已提交
234 235 236
  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;
237

238 239 240
  // If non-null, then we should collect metrics about database operations
  Statistics* statistics;

241 242 243 244 245 246 247 248
  // If true, then the contents of data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
  // sync to the OS to flush all dirty buffesrs to stable storage.
  // Default: false
  bool disableDataSync;

249 250 251 252 253 254 255
  // If true, then every store to stable storage will issue a fsync.
  // If false, then every store to stable storage will issue a fdatasync.
  // This parameter should be set to true while storing data to
  // filesystem like ext3 which can lose files after a reboot.
  // Default: false
  bool use_fsync;

256 257 258 259 260 261
  // This number controls how often a new scribe log about
  // db deploy stats is written out.
  // -1 indicates no logging at all.
  // Default value is 1800 (half an hour).
  int db_stats_log_interval;

H
heyongqiang 已提交
262 263 264 265 266 267 268
  // This specifies the log dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir;

269 270 271 272 273 274
  // Disable compaction triggered by seek.
  // With bloomfilter and fast storage, a miss on one level
  // is very cheap if the file handle is cached in table cache
  // (which is true if max_open_files is large).
  bool disable_seek_compaction;

275 276 277 278 279
  // The periodicity when obsolete files get deleted. The default
  // value is 0 which means that obsolete files get removed after
  // every compaction run.
  uint64_t delete_obsolete_files_period_micros;

280 281 282 283 284 285 286
  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size;

287 288 289 290
  // Puts are delayed when any level has a compaction score that
  // exceeds rate_limit. This is ignored when <= 1.0.
  double rate_limit;

291 292 293 294 295 296 297 298
  // Disable block cache. If this is set to false,
  // then no block cache should be used, and the block_cache should
  // point to a NULL object.
  bool no_block_cache;

  // Number of shards used for table cache.
  int table_cache_numshardbits;

J
jorlow@chromium.org 已提交
299 300
  // Create an Options object with default values for all fields.
  Options();
301 302

  void Dump(Logger * log) const;
303 304 305 306 307 308 309 310 311 312 313 314 315 316 317

  // This method allows an application to modify/delete a key-value at 
  // the time of compaction. The compaction process invokes this
  // method for every kv that is being compacted. A return value
  // of false indicates that the kv should be preserved in the
  // output of this compaction run and a return value of true
  // indicates that this key-value should be removed from the 
  // output of the compaction.  The application can inspect
  // the existing value of the key, modify it if needed and
  // return back the new value for this key. The application
  // should allocate memory for the Slice object that is used to
  // return the new value and the leveldb framework will
  // free up that memory.
  bool (*CompactionFilter)(int level, const Slice& key, 
         const Slice& existing_value, Slice** new_value);
J
jorlow@chromium.org 已提交
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
};

// Options that control read operations
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;

  // Should the data read for this iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;

  // If "snapshot" is non-NULL, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released).  If "snapshot" is NULL, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: NULL
  const Snapshot* snapshot;

  // Creates read options with the documented defaults.
  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
        snapshot(NULL) {
  }

  // Creates read options with explicit checksum-verification and
  // cache-fill settings; "snapshot" is left NULL.
  ReadOptions(bool cksum, bool cache) :
              verify_checksums(cksum), fill_cache(cache),
              snapshot(NULL) {
  }
};

// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete.  If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost.  Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // In other words, a DB write with sync==false has similar
  // crash semantics as the "write()" system call.  A DB write
  // with sync==true has similar crash semantics to a "write()"
  // system call followed by "fsync()".
  //
  // Default: false
  bool sync;

  // If true, writes will not first go to the write ahead log,
  // and the write may be lost after a crash.
  // Default: false
  bool disableWAL;

  // Creates write options with both flags set to false.
  WriteOptions()
      : sync(false),
        disableWAL(false) {
  }
};

// Options that control flush operations
struct FlushOptions {
  // When true, the flush waits until the flush is done.
  // Default: true
  bool wait;

  // Creates flush options with wait enabled.
  FlushOptions() : wait(true) {}
};

}  // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_