db_bench_tool.cc 297.1 KB
Newer Older
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
S
Siying Dong 已提交
2 3 4
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5
//
J
jorlow@chromium.org 已提交
6 7 8 9
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

J
Jonathan Wiepert 已提交
10
#ifdef GFLAGS
11 12 13
#ifdef NUMA
#include <numa.h>
#endif
D
Dmitri Smirnov 已提交
14
#ifndef OS_WIN
15
#include <unistd.h>
D
Dmitri Smirnov 已提交
16
#endif
17
#include <fcntl.h>
J
jorlow@chromium.org 已提交
18 19
#include <stdio.h>
#include <stdlib.h>
20
#include <sys/types.h>
21 22 23 24 25
#ifdef __APPLE__
#include <mach/host_info.h>
#include <mach/mach_host.h>
#include <sys/sysctl.h>
#endif
26 27 28
#ifdef __FreeBSD__
#include <sys/sysctl.h>
#endif
29
#include <atomic>
30
#include <cinttypes>
31
#include <condition_variable>
32
#include <cstddef>
33
#include <iostream>
S
Siying Dong 已提交
34
#include <memory>
35
#include <mutex>
36
#include <queue>
37
#include <thread>
38
#include <unordered_map>
39

40
#include "db/db_impl/db_impl.h"
41
#include "db/malloc_stats.h"
J
jorlow@chromium.org 已提交
42
#include "db/version_set.h"
43 44
#include "monitoring/histogram.h"
#include "monitoring/statistics.h"
45
#include "options/cf_options.h"
A
agiardullo 已提交
46 47
#include "port/port.h"
#include "port/stack_trace.h"
48
#include "rocksdb/cache.h"
49
#include "rocksdb/convenience.h"
50 51
#include "rocksdb/db.h"
#include "rocksdb/env.h"
52
#include "rocksdb/filter_policy.h"
A
agiardullo 已提交
53 54 55
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
56
#include "rocksdb/persistent_cache.h"
S
sdong 已提交
57
#include "rocksdb/rate_limiter.h"
58
#include "rocksdb/secondary_cache.h"
A
agiardullo 已提交
59
#include "rocksdb/slice.h"
60
#include "rocksdb/slice_transform.h"
61
#include "rocksdb/stats_history.h"
62
#include "rocksdb/table.h"
63
#include "rocksdb/utilities/object_registry.h"
A
agiardullo 已提交
64
#include "rocksdb/utilities/optimistic_transaction_db.h"
65
#include "rocksdb/utilities/options_type.h"
66
#include "rocksdb/utilities/options_util.h"
67 68 69
#ifndef ROCKSDB_LITE
#include "rocksdb/utilities/replayer.h"
#endif  // ROCKSDB_LITE
70
#include "rocksdb/utilities/sim_cache.h"
A
agiardullo 已提交
71 72
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
A
agiardullo 已提交
73
#include "rocksdb/write_batch.h"
74 75
#include "test_util/testutil.h"
#include "test_util/transaction_test_util.h"
76
#include "tools/simulated_hybrid_file_system.h"
S
Siying Dong 已提交
77
#include "util/cast_util.h"
I
Igor Canadi 已提交
78
#include "util/compression.h"
A
agiardullo 已提交
79
#include "util/crc32c.h"
80
#include "util/file_checksum_helper.h"
A
Andrew Kryczka 已提交
81
#include "util/gflags_compat.h"
82
#include "util/mutexlock.h"
J
jorlow@chromium.org 已提交
83
#include "util/random.h"
84
#include "util/stderr_logger.h"
A
agiardullo 已提交
85
#include "util/string_util.h"
I
xxHash  
Igor Canadi 已提交
86
#include "util/xxhash.h"
87
#include "utilities/blob_db/blob_db.h"
88
#include "utilities/counted_fs.h"
D
Deon Nicholas 已提交
89
#include "utilities/merge_operators.h"
P
Pooya Shareghi 已提交
90
#include "utilities/merge_operators/bytesxor.h"
91
#include "utilities/merge_operators/sortlist.h"
92
#include "utilities/persistent_cache/block_cache_tier.h"
J
jorlow@chromium.org 已提交
93

94 95 96 97
#ifdef MEMKIND
#include "memory/memkind_kmem_allocator.h"
#endif

D
Dmitri Smirnov 已提交
98
#ifdef OS_WIN
S
sdong 已提交
99
#include <io.h>  // open/close
D
Dmitri Smirnov 已提交
100 101
#endif

A
Andrew Kryczka 已提交
102 103 104
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
using GFLAGS_NAMESPACE::RegisterFlagValidator;
using GFLAGS_NAMESPACE::SetUsageMessage;
T
Tyler Harter 已提交
105

106 107 108 109 110 111
#ifdef ROCKSDB_LITE
#define IF_ROCKSDB_LITE(Then, Else) Then
#else
#define IF_ROCKSDB_LITE(Then, Else) Else
#endif

112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
DEFINE_string(
    benchmarks,
    "fillseq,"
    "fillseqdeterministic,"
    "fillsync,"
    "fillrandom,"
    "filluniquerandomdeterministic,"
    "overwrite,"
    "readrandom,"
    "newiterator,"
    "newiteratorwhilewriting,"
    "seekrandom,"
    "seekrandomwhilewriting,"
    "seekrandomwhilemerging,"
    "readseq,"
    "readreverse,"
    "compact,"
129
    "compactall,"
130
    "flush,"
131
IF_ROCKSDB_LITE("",
132 133 134
    "compact0,"
    "compact1,"
    "waitforcompaction,"
135
)
136
    "multireadrandom,"
137
    "mixgraph,"
138
    "readseq,"
139
    "readtorowcache,"
140 141 142 143
    "readtocache,"
    "readreverse,"
    "readwhilewriting,"
    "readwhilemerging,"
Y
Yi Wu 已提交
144
    "readwhilescanning,"
145 146
    "readrandomwriterandom,"
    "updaterandom,"
P
Pooya Shareghi 已提交
147
    "xorupdaterandom,"
148
    "approximatesizerandom,"
149 150 151 152
    "randomwithverify,"
    "fill100K,"
    "crc32c,"
    "xxhash,"
153 154
    "xxhash64,"
    "xxh3,"
155 156 157 158 159 160
    "compress,"
    "uncompress,"
    "acquireload,"
    "fillseekseq,"
    "randomtransaction,"
    "randomreplacekeys,"
161 162
    "timeseries,"
    "getmergeoperands",
163 164 165 166 167 168 169 170 171 172 173 174 175

    "Comma-separated list of operations to run in the specified"
    " order. Available benchmarks:\n"
    "\tfillseq       -- write N values in sequential key"
    " order in async mode\n"
    "\tfillseqdeterministic       -- write N values in the specified"
    " key order and keep the shape of the LSM tree\n"
    "\tfillrandom    -- write N values in random key order in async"
    " mode\n"
    "\tfilluniquerandomdeterministic       -- write N values in a random"
    " key order and keep the shape of the LSM tree\n"
    "\toverwrite     -- overwrite N values in random key order in"
    " async mode\n"
176
    "\tfillsync      -- write N/1000 values in random key order in "
177 178 179 180 181 182 183 184 185 186 187 188 189 190
    "sync mode\n"
    "\tfill100K      -- write N/1000 100K values in random order in"
    " async mode\n"
    "\tdeleteseq     -- delete N keys in sequential order\n"
    "\tdeleterandom  -- delete N keys in random order\n"
    "\treadseq       -- read N times sequentially\n"
    "\treadtocache   -- 1 thread reading database sequentially\n"
    "\treadreverse   -- read N times in reverse order\n"
    "\treadrandom    -- read N times in random order\n"
    "\treadmissing   -- read N missing keys in random order\n"
    "\treadwhilewriting      -- 1 writer, N threads doing random "
    "reads\n"
    "\treadwhilemerging      -- 1 merger, N threads doing random "
    "reads\n"
Y
Yi Wu 已提交
191 192
    "\treadwhilescanning     -- 1 thread doing full table scan, "
    "N threads doing random reads\n"
193 194 195 196
    "\treadrandomwriterandom -- N threads doing random-read, "
    "random-write\n"
    "\tupdaterandom  -- N threads doing read-modify-write for random "
    "keys\n"
P
Pooya Shareghi 已提交
197 198
    "\txorupdaterandom  -- N threads doing read-XOR-write for "
    "random keys\n"
199 200 201 202 203 204 205 206 207 208 209 210 211 212
    "\tappendrandom  -- N threads doing read-modify-write with "
    "growing values\n"
    "\tmergerandom   -- same as updaterandom/appendrandom using merge"
    " operator. "
    "Must be used with merge_operator\n"
    "\treadrandommergerandom -- perform N random read-or-merge "
    "operations. Must be used with merge_operator\n"
    "\tnewiterator   -- repeated iterator creation\n"
    "\tseekrandom    -- N random seeks, call Next seek_nexts times "
    "per seek\n"
    "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
    "overwrite\n"
    "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
    "merge\n"
213 214 215 216
    "\tcrc32c        -- repeated crc32c of <block size> data\n"
    "\txxhash        -- repeated xxHash of <block size> data\n"
    "\txxhash64      -- repeated xxHash64 of <block size> data\n"
    "\txxh3          -- repeated XXH3 of <block size> data\n"
217 218 219 220 221 222 223 224 225 226
    "\tacquireload   -- load N*1000 times\n"
    "\tfillseekseq   -- write N values in sequential key, then read "
    "them by seeking to each key\n"
    "\trandomtransaction     -- execute N random transactions and "
    "verify correctness\n"
    "\trandomreplacekeys     -- randomly replaces N keys by deleting "
    "the old version and putting the new version\n\n"
    "\ttimeseries            -- 1 writer generates time series data "
    "and multiple readers doing random reads on id\n\n"
    "Meta operations:\n"
A
Aaron Gao 已提交
227 228
    "\tcompact     -- Compact the entire DB; If multiple, randomly choose one\n"
    "\tcompactall  -- Compact the entire DB\n"
229
IF_ROCKSDB_LITE("",
230 231 232
    "\tcompact0  -- compact L0 into L1\n"
    "\tcompact1  -- compact L1 into L2\n"
    "\twaitforcompaction - pause until compaction is (probably) done\n"
233
)
234
    "\tflush - flush the memtable\n"
235
    "\tstats       -- Print DB stats\n"
S
Siying Dong 已提交
236
    "\tresetstats  -- Reset DB stats\n"
237
    "\tlevelstats  -- Print the number of files and bytes per level\n"
238
    "\tmemstats  -- Print memtable stats\n"
239
    "\tsstables    -- Print sstable info\n"
240
    "\theapprofile -- Dump a heap profile (if supported by this port)\n"
Y
Yanqin Jin 已提交
241
IF_ROCKSDB_LITE("",
242
    "\treplay      -- replay the trace file specified with trace_file\n"
Y
Yanqin Jin 已提交
243
)
244 245 246 247 248 249
    "\tgetmergeoperands -- Insert lots of merge records which are a list of "
    "sorted ints for a key and then compare performance of lookup for another "
    "key "
    "by doing a Get followed by binary searching in the large sorted list vs "
    "doing a GetMergeOperands and binary searching in the operands which are"
    "sorted sub-lists. The MergeOperator used is sortlist.h\n");
250 251 252 253 254 255 256 257

DEFINE_int64(num, 1000000, "Number of key/values to place in database");

DEFINE_int64(numdistinct, 1000,
             "Number of distinct keys to use. Used in RandomWithVerify to "
             "read/write on fewer keys so that gets are more likely to find the"
             " key and puts are more likely to update the same key");

258 259 260 261
DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");
262
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
263

264
DEFINE_int32(
265
    num_hot_column_families, 0,
266 267 268 269 270
    "Number of Hot Column Families. If more than 0, only write to this "
    "number of column families. After finishing all the writes to them, "
    "create new set of column families and insert to them. Only used "
    "when num_column_families > 1.");

271 272 273 274 275 276 277 278 279
DEFINE_string(column_family_distribution, "",
              "Comma-separated list of percentages, where the ith element "
              "indicates the probability of an op using the ith column family. "
              "The number of elements must be `num_hot_column_families` if "
              "specified; otherwise, it must be `num_column_families`. The "
              "sum of elements must be 100. E.g., if `num_column_families=4`, "
              "and `num_hot_column_families=0`, a valid list could be "
              "\"10,20,30,40\".");

280 281 282
DEFINE_int64(reads, -1, "Number of read operations to do.  "
             "If negative, do FLAGS_num reads.");

Y
Yueh-Hsuan Chiang 已提交
283 284 285
DEFINE_int64(deletes, -1, "Number of delete operations to do.  "
             "If negative, do FLAGS_num deletions.");

L
Lei Jin 已提交
286 287
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");

288 289 290 291 292 293 294 295
DEFINE_int64(seed, 0, "Seed base for random number generators. "
             "When 0 it is deterministic.");

DEFINE_int32(threads, 1, "Number of concurrent threads to run.");

DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
             " When 0 then num & reads determine the test duration");

296 297 298 299 300 301 302 303 304
DEFINE_string(value_size_distribution_type, "fixed",
              "Value size distribution type: fixed, uniform, normal");

DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
static unsigned int value_size = 100;

DEFINE_int32(value_size_min, 100, "Min size of random value");

DEFINE_int32(value_size_max, 102400, "Max size of random value");
T
Tyler Harter 已提交
305

306 307
DEFINE_int32(seek_nexts, 0,
             "How many times to call Next() after Seek() in "
308 309
             "fillseekseq, seekrandom, seekrandomwhilewriting and "
             "seekrandomwhilemerging");
T
Tomislav Novak 已提交
310

M
Mark Callaghan 已提交
311 312 313 314
DEFINE_bool(reverse_iterator, false,
            "When true use Prev rather than Next for iterators that do "
            "Seek and then Next");

315 316 317 318
DEFINE_int64(max_scan_distance, 0,
             "Used to define iterate_upper_bound (or iterate_lower_bound "
             "if FLAGS_reverse_iterator is set to true) when value is nonzero");

319
DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
320

321 322
DEFINE_int64(batch_size, 1, "Batch size");

A
Andrew Kryczka 已提交
323
// Flag validator for --key_size. Intentionally a no-op: every value is
// accepted. Kept so the flag retains a RegisterFlagValidator hook point;
// parameters are deliberately unnamed since they are never inspected.
static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}
326

327 328
// Flag validator for uint64-typed flags that are semantically restricted to
// the uint32 range (registered via RegisterFlagValidator, e.g. for
// --subcompactions).
//
// Returns true iff `value` fits in 32 bits; otherwise prints a diagnostic
// naming the offending flag to stderr and returns false, which makes gflags
// reject the command line.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    // Use PRIu64 rather than casting to (unsigned long) with %lu: unsigned
    // long is only 32 bits on LLP64 platforms (e.g. Windows), which would
    // truncate the very value being reported as out of range.
    fprintf(stderr, "Invalid value for --%s: %" PRIu64 ", overflow\n",
            flagname, value);
    return false;
  }
  return true;
}

336
DEFINE_int32(key_size, 16, "size of each key");
337

338 339 340
DEFINE_int32(user_timestamp_size, 0,
             "number of bytes in a user-defined timestamp");

341 342 343
DEFINE_int32(num_multi_db, 0,
             "Number of DBs used in the benchmark. 0 means single DB.");

344 345
DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
              " to this fraction of their original size after compression");
J
jorlow@chromium.org 已提交
346

347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
DEFINE_double(
    overwrite_probability, 0.0,
    "Used in 'filluniquerandom' benchmark: for each write operation, "
    "we give a probability to perform an overwrite instead. The key used for "
    "the overwrite is randomly chosen from the last 'overwrite_window_size' "
    "keys "
    "previously inserted into the DB. "
    "Valid overwrite_probability values: [0.0, 1.0].");

DEFINE_uint32(overwrite_window_size, 1,
              "Used in 'filluniquerandom' benchmark. For each write "
              "operation, when "
              "the overwrite_probability flag is set by the user, the key used "
              "to perform "
              "an overwrite is randomly chosen from the last "
              "'overwrite_window_size' keys "
              "previously inserted into the DB. "
              "Warning: large values can affect throughput. "
              "Valid overwrite_window_size values: [1, kMaxUint32].");

367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
DEFINE_uint64(
    disposable_entries_delete_delay, 0,
    "Minimum delay in microseconds for the series of Deletes "
    "to be issued. When 0 the insertion of the last disposable entry is "
    "immediately followed by the issuance of the Deletes. "
    "(only compatible with fillanddeleteuniquerandom benchmark).");

DEFINE_uint64(disposable_entries_batch_size, 0,
              "Number of consecutively inserted disposable KV entries "
              "that will be deleted after 'delete_delay' microseconds. "
              "A series of Deletes is always issued once all the "
              "disposable KV entries it targets have been inserted "
              "into the DB. When 0 no deletes are issued and a "
              "regular 'filluniquerandom' benchmark occurs. "
              "(only compatible with fillanddeleteuniquerandom benchmark)");

DEFINE_int32(disposable_entries_value_size, 64,
             "Size of the values (in bytes) of the entries targeted by "
             "selective deletes. "
             "(only compatible with fillanddeleteuniquerandom benchmark)");

DEFINE_uint64(
    persistent_entries_batch_size, 0,
    "Number of KV entries being inserted right before the deletes "
    "targeting the disposable KV entries are issued. These "
    "persistent keys are not targeted by the deletes, and will always "
    "remain valid in the DB. (only compatible with "
    "--benchmarks='fillanddeleteuniquerandom' "
    "and used when--disposable_entries_batch_size is > 0).");

DEFINE_int32(persistent_entries_value_size, 64,
             "Size of the values (in bytes) of the entries not targeted by "
             "deletes. (only compatible with "
             "--benchmarks='fillanddeleteuniquerandom' "
             "and used when--disposable_entries_batch_size is > 0).");

403 404
DEFINE_double(read_random_exp_range, 0.0,
              "Read random's key will be generated using distribution of "
405
              "num * exp(-r) where r is uniform number from 0 to this value. "
406 407 408
              "The larger the number is, the more skewed the reads are. "
              "Only used in readrandom and multireadrandom benchmarks.");

409
DEFINE_bool(histogram, false, "Print histogram of operation timings");
J
jorlow@chromium.org 已提交
410

411 412 413 414 415 416 417 418
DEFINE_bool(enable_numa, false,
            "Make operations aware of NUMA architecture and bind memory "
            "and cpus corresponding to nodes together. In NUMA, memory "
            "in same node as CPUs are closer when compared to memory in "
            "other nodes. Reads can be faster when the process is bound to "
            "CPU and memory of same node. Use \"$numactl --hardware\" command "
            "to see NUMA memory architecture.");

419 420
DEFINE_int64(db_write_buffer_size,
             ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
421 422
             "Number of bytes to buffer in all memtables before compacting");

423 424 425
DEFINE_bool(cost_write_buffer_to_cache, false,
            "The usage of memtable is costed to the block cache");

426 427 428
DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
             "The size, in bytes, of one block in arena memory allocation.");

429
DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
430
             "Number of bytes to buffer in memtable before compacting");
431

432
DEFINE_int32(max_write_buffer_number,
433
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
434
             "The number of in-memory memtables. Each memtable is of size"
Y
Yanqin Jin 已提交
435
             " write_buffer_size bytes.");
436

437
DEFINE_int32(min_write_buffer_number_to_merge,
438
             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
439 440 441 442 443 444 445 446
             "The minimum number of write buffers that will be merged together"
             "before writing to storage. This is cheap because it is an"
             "in-memory merge. If this feature is not enabled, then all these"
             "write buffers are flushed to L0 as separate files and this "
             "increases read amplification because a get request has to check"
             " in all of these files. Also, an in-memory merge may result in"
             " writing less data to storage if there are duplicate records "
             " in each of these individual write buffers.");
447

448
DEFINE_int32(max_write_buffer_number_to_maintain,
449
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
450 451 452 453 454 455 456 457 458 459 460 461
             "The total maximum number of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

462
DEFINE_int64(max_write_buffer_size_to_maintain,
463
             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
464 465 466 467 468 469 470 471 472 473 474 475
             "The total maximum size of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

476
DEFINE_int32(max_background_jobs,
477
             ROCKSDB_NAMESPACE::Options().max_background_jobs,
478 479 480
             "The maximum number of concurrent background jobs that can occur "
             "in parallel.");

481 482 483 484
DEFINE_int32(num_bottom_pri_threads, 0,
             "The number of threads in the bottom-priority thread pool (used "
             "by universal compaction only).");

485 486 487 488 489 490 491 492
DEFINE_int32(num_high_pri_threads, 0,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

DEFINE_int32(num_low_pri_threads, 0,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

493
DEFINE_int32(max_background_compactions,
494
             ROCKSDB_NAMESPACE::Options().max_background_compactions,
495 496
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");
497

498
DEFINE_uint64(subcompactions, 1,
499 500 501
              "Maximum number of subcompactions to divide L0-L1 compactions "
              "into.");
static const bool FLAGS_subcompactions_dummy
T
Tamir Duberstein 已提交
502
    __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions,
503
                                                    &ValidateUint32Range);
504

505
DEFINE_int32(max_background_flushes,
506
             ROCKSDB_NAMESPACE::Options().max_background_flushes,
507 508 509
             "The maximum number of concurrent background flushes"
             " that can occur in parallel.");

510 511 512
static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
513
             "style of compaction: level-based, universal and fifo");
514

515 516 517
static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
518 519
             "priority of files to compaction: by size or by data age");

520 521 522
DEFINE_int32(universal_size_ratio, 0,
             "Percentage flexibility while comparing file size"
             " (for universal compaction only).");
523

524 525
DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
             " single compaction run (for universal compaction only).");
526

527 528
DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
             " in universal style compaction");
529

530 531
DEFINE_int32(universal_max_size_amplification_percent, 0,
             "The max size amplification for universal style compaction");
532

533 534 535 536
DEFINE_int32(universal_compression_size_percent, -1,
             "The percentage of the database to compress for universal "
             "compaction. -1 means compress everything.");

537
DEFINE_bool(universal_allow_trivial_move, false,
538
            "Allow trivial move in universal compaction.");
539

540 541 542
DEFINE_bool(universal_incremental, false,
            "Enable incremental compactions in universal compaction.");

Y
Yi Wu 已提交
543 544 545 546 547 548 549 550
DEFINE_int64(cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of uncompressed data");

DEFINE_int32(cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** cache_numshardbits. Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");

551 552 553 554 555
DEFINE_double(cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

Y
Yi Wu 已提交
556 557
DEFINE_bool(use_clock_cache, false,
            "Replace default LRU block cache with clock cache.");
558

559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
DEFINE_bool(use_lru_secondary_cache, false,
            "Use the LRUSecondaryCache as the secondary cache.");

DEFINE_int64(lru_secondary_cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of data");

DEFINE_int32(lru_secondary_cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** lru_secondary_cache_numshardbits."
             " Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");

DEFINE_double(lru_secondary_cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

DEFINE_string(lru_secondary_cache_compression_type, "lz4",
              "The compression algorithm to use for large "
              "values stored in LRUSecondaryCache.");
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_lru_secondary_cache_compression_type_e =
        ROCKSDB_NAMESPACE::kLZ4Compression;

DEFINE_uint32(
    lru_secondary_cache_compress_format_version, 2,
    "compress_format_version can have two values: "
    "compress_format_version == 1 -- decompressed size is not included"
    " in the block header."
    "compress_format_version == 2 -- decompressed size is included"
    " in the block header in varint32 format.");

591 592
DEFINE_int64(simcache_size, -1,
             "Number of bytes to use as a simcache of "
Y
Yi Wu 已提交
593
             "uncompressed data. Nagative value disables simcache.");
J
jorlow@chromium.org 已提交
594

595 596 597
DEFINE_bool(cache_index_and_filter_blocks, false,
            "Cache index/filter blocks in block cache.");

598 599 600
DEFINE_bool(use_cache_memkind_kmem_allocator, false,
            "Use memkind kmem allocator for block cache.");

601 602 603
DEFINE_bool(partition_index_and_filters, false,
            "Partition index and filter blocks.");

604 605
DEFINE_bool(partition_index, false, "Partition index blocks");

606 607
DEFINE_bool(index_with_first_key, false, "Include first key in the index");

608 609 610 611 612
DEFINE_bool(
    optimize_filters_for_memory,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
    "Minimize memory footprint of filters");

613 614 615 616 617
DEFINE_int64(
    index_shortening_mode, 2,
    "mode to shorten index: 0 for no shortening; 1 for only shortening "
    "separaters; 2 for shortening shortening and successor");

618
DEFINE_int64(metadata_block_size,
619
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
620 621 622 623 624 625 626
             "Max partition size when partitioning index/filters");

// The default reduces the overhead of reading time with flash. With HDD, which
// offers much less throughput, however, this number better to be set to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
             "Check duration limit every x ops");

627 628 629
DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
            "Pin index/filter blocks of L0 files in block cache.");

630 631 632 633
DEFINE_bool(
    pin_top_level_index_and_filter, false,
    "Pin top-level index of partitioned index/filter blocks in block cache.");

634
DEFINE_int32(block_size,
635 636
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
637
             "Number of bytes in a block.");
638

639 640 641 642
DEFINE_int32(format_version,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
             "Format version of SST files.");
643

644
DEFINE_int32(block_restart_interval,
645
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
646
             "Number of keys between restart points "
647 648
             "for delta encoding of keys in data block.");

649 650 651 652 653
DEFINE_int32(
    index_block_restart_interval,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
    "Number of keys between restart points "
    "for delta encoding of keys in index block.");
654

655
DEFINE_int32(read_amp_bytes_per_bit,
656
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
657 658
             "Number of bytes per bit to be used in block read-amp bitmap");

659 660 661 662
DEFINE_bool(
    enable_index_compression,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
    "Compress the index block");
663

664 665
DEFINE_bool(block_align,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
666 667
            "Align data blocks on page size");

668 669 670 671
DEFINE_int64(prepopulate_block_cache, 0,
             "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
             "to insert during flush");

672 673 674 675 676 677 678 679 680 681
DEFINE_bool(use_data_block_hash_index, false,
            "if use kDataBlockBinaryAndHash "
            "instead of kDataBlockBinarySearch. "
            "This is valid if only we use BlockTable");

DEFINE_double(data_block_hash_table_util_ratio, 0.75,
              "util ratio for data block hash index table. "
              "This is only valid if use_data_block_hash_index is "
              "set to true");

682 683 684
DEFINE_int64(compressed_cache_size, -1,
             "Number of bytes to use as a cache of compressed data.");

685 686 687 688
DEFINE_int64(row_cache_size, 0,
             "Number of bytes to use as a cache of individual rows"
             " (0 = disabled).");

689
DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
690 691
             "Maximum number of files to keep open at the same time"
             " (use default if == 0)");
692

693 694
DEFINE_int32(file_opening_threads,
             ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
695 696 697
             "If open_files is set to -1, this option set the number of "
             "threads that will be used to open files during DB::Open()");

698 699
DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");

700 701
DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");

D
Dmitri Smirnov 已提交
702 703
DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
             "Maximum windows randomaccess buffer size");
704

I
Islam AbdelRahman 已提交
705 706
DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
             "Maximum write buffer for Writable File");
707

708 709 710
DEFINE_int32(bloom_bits, -1,
             "Bloom filter bits per key. Negative means use default."
             "Zero disables.");
711 712 713

DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");

714 715 716
DEFINE_double(memtable_bloom_size_ratio, 0,
              "Ratio of memtable size used for bloom filter. 0 means no bloom "
              "filter.");
717 718
DEFINE_bool(memtable_whole_key_filtering, false,
            "Try to use whole key bloom filter in memtables.");
719 720
DEFINE_bool(memtable_use_huge_page, false,
            "Try to use huge page in memtables.");
S
Sanjay Ghemawat 已提交
721

722 723 724 725
DEFINE_bool(whole_key_filtering,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
            "Use whole keys (in addition to prefixes) in SST bloom filter.");

726 727 728
DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
            " database.  If you set this flag and also specify a benchmark that"
            " wants a fresh database, that benchmark will fail.");
729

730 731 732 733 734 735 736 737
DEFINE_bool(use_existing_keys, false,
            "If true, uses existing keys in the DB, "
            "rather than generating new ones. This involves some startup "
            "latency to load all keys into memory. It is supported for the "
            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
            "must also be set for this flag to be enabled. When this flag is "
            "set, the value for `-num` will be ignored.");

738 739 740 741 742
DEFINE_bool(show_table_properties, false,
            "If true, then per-level table"
            " properties will be printed on every stats-interval when"
            " stats_interval is set and stats_per_interval is on.");

743
DEFINE_string(db, "", "Use the db with the following name.");
744

745 746 747 748 749 750 751 752 753 754 755 756 757 758
// Read cache flags
//
// NOTE(review): these appear to configure a file-backed (persistent) read
// cache — confirm against where db_bench wires them into the options.

// Directory for the read cache; empty string (default) disables the cache.
DEFINE_string(read_cache_path, "",
              "If not empty string, a read cache will be used in this path");

// Capacity of the read cache in bytes; default is 4 GiB.
DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
             "Maximum size of the read cache");

// Use direct I/O (bypassing the OS page cache) when writing cache entries.
DEFINE_bool(read_cache_direct_write, true,
            "Whether to use Direct IO for writing to the read cache");

// Use direct I/O when reading entries back from the cache.
DEFINE_bool(read_cache_direct_read, true,
            "Whether to use Direct IO for reading from read cache");

759
DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
760

761 762 763 764 765 766 767 768
// gflags validator: the cache shard count is 2^value, so cap the shard bits
// below 20 to keep the shard count sane. Prints to stderr and returns false
// on a bad value, which makes gflags reject the flag.
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  const bool within_limit = value < 20;
  if (!within_limit) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
            value);
  }
  return within_limit;
}
769

770 771
DEFINE_bool(verify_checksum, true,
            "Verify checksum for every block read"
772
            " from storage");
773

774 775 776 777
DEFINE_int32(checksum_type,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
             "ChecksumType as an int");

778
DEFINE_bool(statistics, false, "Database statistics");
779
DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
S
Siying Dong 已提交
780
             "stats level for statistics");
781
DEFINE_string(statistics_string, "", "Serialized statistics string");
782
static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
783

784 785
DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
             " --num reads.");
H
heyongqiang 已提交
786

787 788
DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished");

789
DEFINE_bool(sync, false, "Sync all writes to disk");
H
heyongqiang 已提交
790

791
DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
M
Mark Callaghan 已提交
792

793
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
794

795 796 797
DEFINE_bool(manual_wal_flush, false,
            "If true, buffer WAL until buffer is full or a manual FlushWAL().");

798
DEFINE_string(wal_compression, "none",
799 800 801 802
              "Algorithm to use for WAL compression. none to disable.");
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
    ROCKSDB_NAMESPACE::kNoCompression;

L
Lei Jin 已提交
803 804
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");

805 806 807
DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
              "Truth key/values used when using verify");

808
DEFINE_int32(num_levels, 7, "The total number of levels");
H
heyongqiang 已提交
809

810 811
DEFINE_int64(target_file_size_base,
             ROCKSDB_NAMESPACE::Options().target_file_size_base,
812
             "Target file size at level-1");
H
heyongqiang 已提交
813

814
DEFINE_int32(target_file_size_multiplier,
815
             ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
816
             "A multiplier to compute target level-N file size (N >= 2)");
817

818
DEFINE_uint64(max_bytes_for_level_base,
819
              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
820
              "Max bytes for level-1");
H
heyongqiang 已提交
821

822 823 824
DEFINE_bool(level_compaction_dynamic_level_bytes, false,
            "Whether level size base is dynamic");

825 826
DEFINE_double(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");
H
heyongqiang 已提交
827

828 829 830
static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
              "A vector that specifies additional fanout per level");
831

832
DEFINE_int32(level0_stop_writes_trigger,
833
             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
834
             "Number of files in level-0"
835
             " that will trigger put stop.");
836

837
DEFINE_int32(level0_slowdown_writes_trigger,
838
             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
839
             "Number of files in level-0"
840
             " that will slow down writes.");
841

842
DEFINE_int32(level0_file_num_compaction_trigger,
843
             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
844
             "Number of files in level-0"
845
             " when compactions start");
846

847 848 849 850 851
DEFINE_uint64(periodic_compaction_seconds,
              ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
              "Files older than this will be picked up for compaction and"
              " rewritten to the same level");

852 853
DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");

854 855 856 857 858 859 860 861 862 863 864 865 866
// gflags validator for percentage flags: accepts only the open interval
// (0, 100), i.e. 1..99 inclusive. Reports the offending flag and value to
// stderr and returns false so gflags rejects the setting.
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  const bool is_valid_pct = value > 0 && value < 100;
  if (!is_valid_pct) {
    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
            value);
  }
  return is_valid_pct;
}
DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
             " as percentage) for the ReadRandomWriteRandom workload. The "
             "default value 90 means 90% operations out of all reads and writes"
             " operations are reads. In other words, 9 gets for every 1 put.");

867 868 869 870 871
DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
             " as percentage) for the ReadRandomMergeRandom workload. The"
             " default value 70 means 70% out of all read and merge operations"
             " are merges. In other words, 7 merges for every 3 gets.");

872 873 874 875 876 877
DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
             "deletes (used in RandomWithVerify only). RandomWithVerify "
             "calculates writepercent as (100 - FLAGS_readwritepercent - "
             "deletepercent), so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");

878 879
DEFINE_bool(optimize_filters_for_hits,
            ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
880 881 882 883
            "Optimizes bloom filters for workloads for most lookups return "
            "a value. For now this doesn't create bloom filters for the max "
            "level of the LSM to reduce metadata that should fit in RAM. ");

884 885 886 887 888 889 890 891 892 893 894 895 896
DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
            "RocksDB will aggressively check consistency of the data.");

DEFINE_bool(force_consistency_checks,
            ROCKSDB_NAMESPACE::Options().force_consistency_checks,
            "Runs consistency checks on the LSM every time a change is "
            "applied.");

DEFINE_bool(check_flush_compaction_key_order,
            ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
            "During flush or compaction, check whether keys inserted to "
            "output files are in order.");

I
Igor Canadi 已提交
897 898
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
              "Ignored. Left here for backward compatibility");
899

900 901 902
DEFINE_int64(writes_before_delete_range, 0,
             "Number of writes before DeleteRange is called regularly.");

A
Andrew Kryczka 已提交
903
DEFINE_int64(writes_per_range_tombstone, 0,
904
             "Number of writes between range tombstones");
A
Andrew Kryczka 已提交
905 906 907 908 909 910 911

DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");

DEFINE_int64(max_num_range_tombstones, 0,
             "Maximum number of range tombstones "
             "to insert.");

912 913 914
DEFINE_bool(expand_range_tombstones, false,
            "Expand range tombstone into sequential regular tombstones.");

915
#ifndef ROCKSDB_LITE
916
// Transactions Options
A
agiardullo 已提交
917
DEFINE_bool(optimistic_transaction_db, false,
A
agiardullo 已提交
918 919 920
            "Open a OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");

A
agiardullo 已提交
921 922 923 924
DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

A
agiardullo 已提交
925 926 927 928
DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (use in RandomTransaction only).  Max: 9999");

929 930 931 932
DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

A
agiardullo 已提交
933 934 935 936
DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only). ");

937 938 939
DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");
940 941 942 943 944 945 946 947 948
DEFINE_string(
    options_file, "",
    "The path to a RocksDB options file.  If specified, then db_bench will "
    "run with the RocksDB options in the default column family of the "
    "specified options file. "
    "Note that with this setting, db_bench will ONLY accept the following "
    "RocksDB options related command-line arguments, all other arguments "
    "that are related to RocksDB options will be ignored:\n"
    "\t--use_existing_db\n"
949
    "\t--use_existing_keys\n"
950 951 952 953 954 955
    "\t--statistics\n"
    "\t--row_cache_size\n"
    "\t--row_cache_numshardbits\n"
    "\t--enable_io_prio\n"
    "\t--dump_malloc_stats\n"
    "\t--num_multi_db\n");
956

957
// FIFO Compaction Options
958 959
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");
960

961 962
DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");
963

S
Sagar Vemuri 已提交
964
DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
965

966 967
DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");

968 969
// Stacked BlobDB Options
DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
S
Sagar Vemuri 已提交
970

971 972 973
DEFINE_bool(
    blob_db_enable_gc,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
974
    "[Stacked BlobDB] Enable BlobDB garbage collection.");
S
Sagar Vemuri 已提交
975

976 977 978
DEFINE_double(
    blob_db_gc_cutoff,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
979
    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
S
Sagar Vemuri 已提交
980

981 982
DEFINE_bool(blob_db_is_fifo,
            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
983
            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
984 985

DEFINE_uint64(blob_db_max_db_size,
986
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
987 988
              "[Stacked BlobDB] Max size limit of the directory where blob "
              "files are stored.");
S
Sagar Vemuri 已提交
989

990 991 992
DEFINE_uint64(blob_db_max_ttl_range, 0,
              "[Stacked BlobDB] TTL range to generate BlobDB data (in "
              "seconds). 0 means no TTL.");
S
Sagar Vemuri 已提交
993

994 995 996 997
DEFINE_uint64(
    blob_db_ttl_range_secs,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
    "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
S
Sagar Vemuri 已提交
998

999 1000 1001 1002 1003
DEFINE_uint64(
    blob_db_min_blob_size,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
    "smaller than this will be inlined with the key in the LSM tree.");
S
Sagar Vemuri 已提交
1004

1005
DEFINE_uint64(blob_db_bytes_per_sync,
1006
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
1007
              "[Stacked BlobDB] Bytes to sync blob file at.");
S
Sagar Vemuri 已提交
1008

1009
DEFINE_uint64(blob_db_file_size,
1010
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
1011
              "[Stacked BlobDB] Target size of each blob file.");
1012

1013 1014 1015
DEFINE_string(
    blob_db_compression_type, "snappy",
    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
1016 1017
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
1018

1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
#endif  // ROCKSDB_LITE

// Integrated BlobDB options
DEFINE_bool(
    enable_blob_files,
    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
    "[Integrated BlobDB] Enable writing large values to separate blob files.");

DEFINE_uint64(min_blob_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
              "[Integrated BlobDB] The size of the smallest value to be stored "
              "separately in a blob file.");

DEFINE_uint64(blob_file_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
              "[Integrated BlobDB] The size limit for blob files.");

DEFINE_string(blob_compression_type, "none",
              "[Integrated BlobDB] The compression algorithm to use for large "
              "values stored in blob files.");

DEFINE_bool(enable_blob_garbage_collection,
            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                .enable_blob_garbage_collection,
            "[Integrated BlobDB] Enable blob garbage collection.");

DEFINE_double(blob_garbage_collection_age_cutoff,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_age_cutoff,
              "[Integrated BlobDB] The cutoff in terms of blob file age for "
              "garbage collection.");

1051 1052 1053 1054 1055 1056
DEFINE_double(blob_garbage_collection_force_threshold,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_force_threshold,
              "[Integrated BlobDB] The threshold for the ratio of garbage in "
              "the oldest blob files for forcing garbage collection.");

1057 1058 1059 1060 1061
DEFINE_uint64(blob_compaction_readahead_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_compaction_readahead_size,
              "[Integrated BlobDB] Compaction readahead for blob files.");

1062 1063
#ifndef ROCKSDB_LITE

1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076
// Secondary DB instance Options
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");

1077
#endif  // ROCKSDB_LITE
1078

1079
DEFINE_bool(report_bg_io_stats, false,
1080 1081
            "Measure times spents on I/Os while in compactions. ");

1082 1083 1084
DEFINE_bool(use_stderr_info_logger, false,
            "Write info logs to stderr instead of to LOG file. ");

1085 1086
#ifndef ROCKSDB_LITE

1087 1088
DEFINE_string(trace_file, "", "Trace workload to a file. ");

1089 1090
DEFINE_double(trace_replay_fast_forward, 1.0,
              "Fast forward trace replay, must > 0.0.");
1091 1092 1093 1094 1095 1096 1097 1098 1099 1100
DEFINE_int32(block_cache_trace_sampling_frequency, 1,
             "Block cache trace sampling frequency, termed s. It uses spatial "
             "downsampling and samples accesses to one out of s blocks.");
DEFINE_int64(
    block_cache_trace_max_trace_file_size_in_bytes,
    uint64_t{64} * 1024 * 1024 * 1024,
    "The maximum block cache trace file size in bytes. Block cache accesses "
    "will not be logged if the trace file size exceeds this threshold. Default "
    "is 64 GB.");
DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
1101 1102
DEFINE_int32(trace_replay_threads, 1,
             "The number of threads to replay, must >=1.");
1103

1104 1105 1106
DEFINE_bool(io_uring_enabled, true,
            "If true, enable the use of IO uring if the platform supports it");
extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
1107 1108
#endif  // ROCKSDB_LITE

1109 1110 1111 1112
DEFINE_bool(adaptive_readahead, false,
            "carry forward internal auto readahead size from one file to next "
            "file at each level during iteration");

1113 1114 1115 1116 1117 1118 1119 1120
DEFINE_bool(rate_limit_user_ops, false,
            "When true use Env::IO_USER priority level to charge internal rate "
            "limiter for reads associated with user operations.");

DEFINE_bool(file_checksum, false,
            "When true use FileChecksumGenCrc32cFactory for "
            "file_checksum_gen_factory.");

1121 1122 1123 1124 1125 1126
DEFINE_bool(rate_limit_auto_wal_flush, false,
            "When true use Env::IO_USER priority level to charge internal rate "
            "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
            "false) after the user "
            "write operation");

1127 1128 1129 1130
DEFINE_bool(async_io, false,
            "When set true, RocksDB does asynchronous reads for internal auto "
            "readahead prefetching.");

1131 1132
static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
    const char* ctype) {
1133 1134 1135
  assert(ctype);

  if (!strcasecmp(ctype, "none"))
1136
    return ROCKSDB_NAMESPACE::kNoCompression;
1137
  else if (!strcasecmp(ctype, "snappy"))
1138
    return ROCKSDB_NAMESPACE::kSnappyCompression;
1139
  else if (!strcasecmp(ctype, "zlib"))
1140
    return ROCKSDB_NAMESPACE::kZlibCompression;
1141
  else if (!strcasecmp(ctype, "bzip2"))
1142
    return ROCKSDB_NAMESPACE::kBZip2Compression;
A
Albert Strasheim 已提交
1143
  else if (!strcasecmp(ctype, "lz4"))
1144
    return ROCKSDB_NAMESPACE::kLZ4Compression;
A
Albert Strasheim 已提交
1145
  else if (!strcasecmp(ctype, "lz4hc"))
1146
    return ROCKSDB_NAMESPACE::kLZ4HCCompression;
1147
  else if (!strcasecmp(ctype, "xpress"))
1148
    return ROCKSDB_NAMESPACE::kXpressCompression;
1149
  else if (!strcasecmp(ctype, "zstd"))
1150
    return ROCKSDB_NAMESPACE::kZSTD;
1151 1152

  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
1153
  return ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
1154
}
1155

1156
static std::string ColumnFamilyName(size_t i) {
1157
  if (i == 0) {
1158
    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
1159 1160
  } else {
    char name[100];
S
sdong 已提交
1161
    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
1162 1163 1164
    return std::string(name);
  }
}
I
Igor Canadi 已提交
1165

1166 1167
DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
1168 1169
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
    ROCKSDB_NAMESPACE::kSnappyCompression;
1170

1171 1172
DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");

1173
DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
1174 1175 1176
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");
1177

1178
DEFINE_int32(compression_max_dict_bytes,
1179
             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
1180 1181 1182
             "Maximum size of dictionary used to prime the compression "
             "library.");

1183
DEFINE_int32(compression_zstd_max_train_bytes,
1184
             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
A
Andrew Kryczka 已提交
1185 1186 1187
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

1188 1189 1190 1191 1192
DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");

1193 1194
DEFINE_int32(compression_parallel_threads, 1,
             "Number of threads for parallel compression.");
1195

1196 1197 1198 1199
DEFINE_uint64(compression_max_dict_buffer_bytes,
              ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
              "Maximum bytes to buffer to collect samples for dictionary.");

1200 1201
// gflags validator for --table_cache_numshardbits: accepts only the open
// interval (0, 20). Prints the offending flag and value to stderr and
// returns false so gflags rejects the setting.
//
// Note: this span was reconstructed from a source fragment corrupted by
// extraction residue; the accepted range and message are unchanged.
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  const bool in_range = value > 0 && value < 20;
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d, must be  0 < val < 20\n",
            flagname, value);
  }
  return in_range;
}
DEFINE_int32(table_cache_numshardbits, 4, "");
1210

1211
#ifndef ROCKSDB_LITE
1212 1213
DEFINE_string(env_uri, "",
              "URI for registry Env lookup. Mutually exclusive"
1214
              " with --fs_uri");
1215 1216
DEFINE_string(fs_uri, "",
              "URI for registry Filesystem lookup. Mutually exclusive"
1217
              " with --env_uri."
1218
              " Creates a default environment with the specified filesystem.");
1219
#endif  // ROCKSDB_LITE
1220 1221 1222 1223
DEFINE_string(simulate_hybrid_fs_file, "",
              "File for Store Metadata for Simulate hybrid FS. Empty means "
              "disable the feature. Now, if it is set, "
              "bottommost_temperature is set to kWarm.");
S
sdong 已提交
1224 1225 1226 1227
DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
             "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
             "are simulated.");
DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
1228

1229
static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
1230

1231
static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
1232

1233 1234
DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
             "this is greater than zero. When 0 the interval grows over time.");
1235

1236 1237 1238
DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
             "overrides stats_interval when both are > 0.");

1239 1240
DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
             " this is greater than 0.");
1241

1242
DEFINE_int64(report_interval_seconds, 0,
A
Adam Retter 已提交
1243
             "If greater than zero, it will write simple stats in CSV format "
1244 1245 1246 1247 1248 1249
             "to --report_file every N seconds");

DEFINE_string(report_file, "report.csv",
              "Filename where some simple stats are reported to (if "
              "--report_interval_seconds is bigger than 0)");

1250 1251 1252 1253
DEFINE_int32(thread_status_per_interval, 0,
             "Takes and report a snapshot of the current status of each thread"
             " when this is greater than 0.");

1254 1255
DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
             "Level of perf collection");
1256

1257 1258 1259
DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

1260
DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
1261
              "Stop writes if pending compaction bytes exceed this number");
1262

S
sdong 已提交
1263
DEFINE_uint64(delayed_write_rate, 8388608u,
S
sdong 已提交
1264 1265 1266
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

1267 1268 1269
DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");

1270 1271 1272 1273
DEFINE_bool(
    unordered_write, false,
    "Enable the unordered write feature, which provides higher throughput but "
    "relaxes the guarantees around atomic reads and immutable snapshots");
M
Maysam Yabandeh 已提交
1274

1275
DEFINE_bool(allow_concurrent_memtable_write, true,
1276 1277
            "Allow multi-writers to update mem tables in parallel.");

1278 1279 1280
DEFINE_double(experimental_mempurge_threshold, 0.0,
              "Maximum useful payload ratio estimate that triggers a mempurge "
              "(memtable garbage collection).");
1281

1282 1283
DEFINE_bool(inplace_update_support,
            ROCKSDB_NAMESPACE::Options().inplace_update_support,
1284 1285 1286
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
1287
              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
1288 1289
              "Number of RW locks to protect in-place memtable updates");

1290
DEFINE_bool(enable_write_thread_adaptive_yield, true,
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

S
sdong 已提交
1301 1302
DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

1303 1304 1305 1306
DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
             "Set refill period on "
             "rate limiter.");

A
Andrew Kryczka 已提交
1307 1308 1309 1310
DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");

1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329

// Sine-wave write rate limiting: when enabled, the write rate limit follows
// f(x) = A sin(bx + c) + d, recalculated on a fixed interval. The four
// coefficient flags below supply A, B, C and D.
DEFINE_bool(sine_write_rate, false,
            "Use a sine wave write_rate_limit");

// How often (in milliseconds) the sine-based rate limit is recomputed.
DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
              "Interval of which the sine wave write_rate_limit is recalculated");

// Amplitude of the sine wave.
DEFINE_double(sine_a, 1,
             "A in f(x) = A sin(bx + c) + d");

// Frequency coefficient of the sine wave.
DEFINE_double(sine_b, 1,
             "B in f(x) = A sin(bx + c) + d");

// Phase offset of the sine wave.
DEFINE_double(sine_c, 0,
             "C in f(x) = A sin(bx + c) + d");

// Vertical offset (baseline rate) of the sine wave.
DEFINE_double(sine_d, 1,
             "D in f(x) = A sin(bx + c) + d");

1330 1331 1332
DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

1333 1334
DEFINE_uint64(
    benchmark_write_rate_limit, 0,
1335 1336
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
    "is the global rate in bytes/second.");
1337

1338
// the parameters of mix_graph
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352
DEFINE_double(keyrange_dist_a, 0.0,
              "The parameter 'a' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_b, 0.0,
              "The parameter 'b' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_c, 0.0,
              "The parameter 'c' of prefix average access distribution"
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_d, 0.0,
              "The parameter 'd' of prefix average access distribution"
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_int64(keyrange_num, 1,
             "The number of key ranges that are in the same prefix "
1353
             "group, each prefix range will have its key access "
1354
             "distribution");
1355 1356 1357 1358 1359 1360 1361 1362 1363
DEFINE_double(key_dist_a, 0.0,
              "The parameter 'a' of key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(key_dist_b, 0.0,
              "The parameter 'b' of key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(value_theta, 0.0,
              "The parameter 'theta' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1364 1365
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(value_k, 0.2615,
1366 1367
              "The parameter 'k' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1368 1369
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(value_sigma, 25.45,
1370 1371 1372 1373 1374
              "The parameter 'theta' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_theta, 0.0,
              "The parameter 'theta' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1375 1376
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(iter_k, 2.517,
1377 1378
              "The parameter 'k' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1379 1380
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(iter_sigma, 14.236,
1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401
              "The parameter 'sigma' of Generized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(mix_get_ratio, 1.0,
              "The ratio of Get queries of mix_graph workload");
DEFINE_double(mix_put_ratio, 0.0,
              "The ratio of Put queries of mix_graph workload");
DEFINE_double(mix_seek_ratio, 0.0,
              "The ratio of Seek queries of mix_graph workload");
DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
DEFINE_double(
    sine_mix_rate_noise, 0.0,
    "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
DEFINE_bool(sine_mix_rate, false,
            "Enable the sine QPS control on the mix workload");
DEFINE_uint64(
    sine_mix_rate_interval_milliseconds, 10000,
    "Interval of which the sine wave read_rate_limit is recalculated");
DEFINE_int64(mix_accesses, -1,
             "The total query accesses of mix_graph workload");

1402 1403 1404 1405 1406
DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

1407 1408
DEFINE_uint64(max_compaction_bytes,
              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
1409
              "Max bytes allowed in one compaction");
1410

1411
#ifndef ROCKSDB_LITE
1412
DEFINE_bool(readonly, false, "Run read only benchmarks.");
Z
Zhongyi Xie 已提交
1413 1414 1415

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
1416
#endif  // ROCKSDB_LITE
H
heyongqiang 已提交
1417

1418
DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
1419

1420 1421 1422
DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
              " in MB.");
1423
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
1424

1425
DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
1426
            "Allow reads to occur via mmap-ing files");
1427

1428
DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
1429
            "Allow writes to occur via mmap-ing files");
1430

1431
DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
1432 1433
            "Use O_DIRECT for reading data");

1434
DEFINE_bool(use_direct_io_for_flush_and_compaction,
1435
            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
1436
            "Use O_DIRECT for background flush and compaction writes");
A
Aaron Gao 已提交
1437

1438 1439
DEFINE_bool(advise_random_on_open,
            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
1440
            "Advise random access on table file open");
1441

1442 1443 1444
DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
static auto FLAGS_compaction_fadvice_e =
1445
    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
1446

I
Igor Canadi 已提交
1447 1448 1449
DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

1450
DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
1451 1452
            "Use adaptive mutex");

1453
DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
1454
              "Allows OS to incrementally sync SST files to disk while they are"
1455 1456
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");
1457

1458 1459
DEFINE_uint64(wal_bytes_per_sync,
              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
1460 1461 1462 1463
              "Allows OS to incrementally sync WAL files to disk while they are"
              " being written, in the background. Issue one request for every"
              " wal_bytes_per_sync written. 0 turns it off.");

A
Andres Noetzli 已提交
1464 1465 1466 1467 1468 1469 1470
DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488
DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");

DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the options "
              "below: none (do not expired data), compaction_filter (use a "
              "compaction filter to remove expired data), delete (seek IDs and "
              "remove expired data) (used in TimeSeries only).");

DEFINE_uint64(
    time_range, 100000,
    "Range of timestamp that store in the database (used in TimeSeries"
    " only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

1489 1490 1491
DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
             " operations on a key in the memtable");

// gflags validator for --prefix_size: accepts any non-negative value below
// 2000000000. Returns false (rejecting the flag) otherwise.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value < 0 || value >= 2000000000) {
    // Message matches the actual accepted range (the upper bound itself is
    // rejected by the condition above).
    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize < 2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}
1500

L
Lei Jin 已提交
1501 1502
DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
             "plain table");
1503 1504 1505
DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
             "per prefix, 0 means no special handling of the prefix, "
             "i.e. use the prefix comes with the generated random number.");
1506 1507 1508 1509 1510 1511 1512 1513
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");

1514 1515 1516
DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
1517 1518
DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
            "threads' IO priority");
1519 1520
DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction "
            "threads' CPU priority");
1521 1522 1523
DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
            "table becomes an identity function. This is only valid when key "
            "is 8 bytes");
1524
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
1525 1526
DEFINE_uint64(stats_dump_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
1527
              "Gap between printing stats to log in seconds");
1528
DEFINE_uint64(stats_persist_period_sec,
1529
              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
1530
              "Gap between persisting stats in seconds");
1531 1532
DEFINE_bool(persist_stats_to_disk,
            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
1533
            "whether to persist stats to disk");
1534
DEFINE_uint64(stats_history_buffer_size,
1535
              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
1536
              "Max number of stats snapshots to keep in memory");
1537 1538 1539
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
J
Jim Paton 已提交
1540

1541
DEFINE_string(memtablerep, "skip_list", "");
1542
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
L
Lei Jin 已提交
1543 1544
DEFINE_bool(use_plain_table, false, "if use plain table "
            "instead of block-based table format");
1545 1546
DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
1547 1548 1549
DEFINE_bool(use_hash_search, false, "if use kHashSearch "
            "instead of kBinarySearch. "
            "This is valid if only we use BlockTable");
1550 1551 1552
DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
            "instead of kFullFilter for filter block. "
            "This is valid if only we use BlockTable");
1553 1554 1555 1556
DEFINE_string(merge_operator, "", "The merge operator to use with the database."
              "If a new merge operator is specified, be sure to use fresh"
              " database The possible merge operators are defined in"
              " utilities/merge_operators.h");
T
Tomislav Novak 已提交
1557 1558 1559
DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
             "linear search first for this many steps from the previous "
             "position");
1560 1561
DEFINE_bool(report_file_operations, false, "if report number of file "
            "operations");
1562
DEFINE_bool(report_open_timing, false, "if report open timing");
1563
DEFINE_int32(readahead_size, 0, "Iterator readahead size");
D
Deon Nicholas 已提交
1564

1565 1566 1567 1568
DEFINE_bool(read_with_latest_user_timestamp, true,
            "If true, always use the current latest timestamp for read. If "
            "false, choose a random timestamp from the past.");

1569 1570 1571 1572 1573 1574
#ifndef ROCKSDB_LITE
DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
#endif  // ROCKSDB_LITE

T
Tamir Duberstein 已提交
1575
static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
1576
    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
K
kailiu 已提交
1577

T
Tamir Duberstein 已提交
1578
static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
1579
    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
K
kailiu 已提交
1580

T
Tamir Duberstein 已提交
1581
static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
1582 1583
    RegisterFlagValidator(&FLAGS_cache_numshardbits,
                          &ValidateCacheNumshardbits);
K
kailiu 已提交
1584

T
Tamir Duberstein 已提交
1585
static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
1586
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
K
kailiu 已提交
1587

I
Igor Canadi 已提交
1588 1589 1590
DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");

T
Tamir Duberstein 已提交
1591
static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
1592
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
T
Tamir Duberstein 已提交
1593
static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) =
1594 1595
    RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
                          &ValidateTableCacheNumshardbits);
K
kailiu 已提交
1596

1597
namespace ROCKSDB_NAMESPACE {
1598
namespace {
1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624
static Status CreateMemTableRepFactory(
    const ConfigOptions& config_options,
    std::shared_ptr<MemTableRepFactory>* factory) {
  Status s;
  if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
    factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
#ifndef ROCKSDB_LITE
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
    factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
                         VectorRepFactory::kNickName())) {
    factory->reset(new VectorRepFactory());
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
    factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
#endif  // ROCKSDB_LITE
  } else {
    std::unique_ptr<MemTableRepFactory> unique;
    s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
                                             &unique);
    if (s.ok()) {
      factory->reset(unique.release());
    }
  }
  return s;
}

1625 1626
}  // namespace

// How generated value sizes are drawn: a single fixed size, or sampled from
// a uniform or normal distribution.
enum DistributionType : unsigned char {
  kFixed = 0,
  kUniform,
  kNormal
};

// Parsed (enum) form of --value_size_distribution_type.
static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;

// Parses a distribution-type name case-insensitively. Unknown names print a
// diagnostic to stdout and fall back to kFixed.
static enum DistributionType StringToDistributionType(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "fixed"))
    return kFixed;
  else if (!strcasecmp(ctype, "uniform"))
    return kUniform;
  else if (!strcasecmp(ctype, "normal"))
    return kNormal;

  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
  return kFixed;  // default value
}

// Base class for value-size generators. Generate() draws a raw sample from
// the concrete distribution and, unless the subclass opts out via
// NeedTruncate(), clamps it into [min, max].
class BaseDistribution {
 public:
  BaseDistribution(unsigned int _min, unsigned int _max)
      : min_value_size_(_min), max_value_size_(_max) {}
  virtual ~BaseDistribution() {}

  unsigned int Generate() {
    auto val = Get();
    if (NeedTruncate()) {
      val = std::max(min_value_size_, val);
      val = std::min(max_value_size_, val);
    }
    return val;
  }

 private:
  // Draws one raw sample from the distribution.
  virtual unsigned int Get() = 0;
  // Whether the raw sample must be clamped into [min, max].
  virtual bool NeedTruncate() {
    return true;
  }
  unsigned int min_value_size_;
  unsigned int max_value_size_;
};

class FixedDistribution : public BaseDistribution
{
 public:
  FixedDistribution(unsigned int size) :
    BaseDistribution(size, size),
    size_(size) {}
 private:
  virtual unsigned int Get() override {
    return size_;
  }
  virtual bool NeedTruncate() override {
    return false;
  }
  unsigned int size_;
};

class NormalDistribution
    : public BaseDistribution, public std::normal_distribution<double> {
 public:
1691 1692 1693 1694 1695 1696 1697 1698
  NormalDistribution(unsigned int _min, unsigned int _max)
      : BaseDistribution(_min, _max),
        // 99.7% values within the range [min, max].
        std::normal_distribution<double>(
            (double)(_min + _max) / 2.0 /*mean*/,
            (double)(_max - _min) / 6.0 /*stddev*/),
        gen_(rd_()) {}

1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710
 private:
  virtual unsigned int Get() override {
    return static_cast<unsigned int>((*this)(gen_));
  }
  std::random_device rd_;
  std::mt19937 gen_;
};

class UniformDistribution
    : public BaseDistribution,
      public std::uniform_int_distribution<unsigned int> {
 public:
1711 1712 1713 1714 1715
  UniformDistribution(unsigned int _min, unsigned int _max)
      : BaseDistribution(_min, _max),
        std::uniform_int_distribution<unsigned int>(_min, _max),
        gen_(rd_()) {}

1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
 private:
  virtual unsigned int Get() override {
    return (*this)(gen_);
  }
  virtual bool NeedTruncate() override {
    return false;
  }
  std::random_device rd_;
  std::mt19937 gen_;
};

1727
// Helper for quickly generating random data.
J
jorlow@chromium.org 已提交
1728 1729 1730
class RandomGenerator {
 private:
  std::string data_;
1731
  unsigned int pos_;
1732
  std::unique_ptr<BaseDistribution> dist_;
J
jorlow@chromium.org 已提交
1733 1734

 public:
1735

J
jorlow@chromium.org 已提交
1736
  RandomGenerator() {
1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
    auto max_value_size = FLAGS_value_size_max;
    switch (FLAGS_value_size_distribution_type_e) {
      case kUniform:
        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
                                            FLAGS_value_size_max));
        break;
      case kNormal:
        dist_.reset(new NormalDistribution(FLAGS_value_size_min,
                                           FLAGS_value_size_max));
        break;
      case kFixed:
      default:
        dist_.reset(new FixedDistribution(value_size));
        max_value_size = value_size;
    }
J
jorlow@chromium.org 已提交
1752 1753 1754 1755 1756
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
1757
    while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
J
jorlow@chromium.org 已提交
1758 1759 1760 1761 1762 1763 1764 1765
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

1766
  Slice Generate(unsigned int len) {
1767
    assert(len <= data_.size());
J
jorlow@chromium.org 已提交
1768 1769 1770 1771 1772 1773
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }
A
Anirban Rahut 已提交
1774

1775 1776 1777
  Slice Generate() {
    auto len = dist_->Generate();
    return Generate(len);
A
Anirban Rahut 已提交
1778
  }
1779
};
X
Xing Jin 已提交
1780

1781 1782 1783 1784 1785 1786 1787 1788
static void AppendWithSpace(std::string* str, Slice msg) {
  if (msg.empty()) return;
  if (!str->empty()) {
    str->push_back(' ');
  }
  str->append(msg.data(), msg.size());
}

1789 1790 1791
struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
1792
#ifndef ROCKSDB_LITE
A
agiardullo 已提交
1793
  OptimisticTransactionDB* opt_txn_db;
1794
#endif  // ROCKSDB_LITE
1795 1796 1797 1798 1799 1800
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
1801 1802
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
                                     // on cfh[i].
1803

1804 1805 1806 1807 1808 1809
  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        , opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
1810
    cfh.clear();
1811 1812
    num_created = 0;
    num_hot = 0;
1813
  }
1814 1815 1816 1817

  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
1818
#ifndef ROCKSDB_LITE
A
agiardullo 已提交
1819
        opt_txn_db(other.opt_txn_db),
1820
#endif  // ROCKSDB_LITE
1821
        num_created(other.num_created.load()),
1822 1823 1824
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }
1825

A
agiardullo 已提交
1826 1827 1828 1829
  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
1830
#ifndef ROCKSDB_LITE
A
agiardullo 已提交
1831 1832 1833
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
A
agiardullo 已提交
1834 1835
    } else {
      delete db;
1836
      db = nullptr;
A
agiardullo 已提交
1837
    }
1838 1839
#else
    delete db;
A
agiardullo 已提交
1840
    db = nullptr;
1841
#endif  // ROCKSDB_LITE
A
agiardullo 已提交
1842 1843
  }

1844 1845
  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
1858
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
1859
               rand_offset];
1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882
  }

  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
  //        stage * num_hot + 1 to stage * (num_hot + 1).
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    num_created.store(new_num_created, std::memory_order_release);
  }
1883 1884
};

1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907
// a class that reports stats to CSV file
class ReporterAgent {
 public:
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }

D
Dmitri Smirnov 已提交
1908
    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927
  }

  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // thread safe
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }
  void SleepAndReport() {
1928 1929
    auto* clock = env_->GetSystemClock().get();
    auto time_started = clock->NowMicros();
1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
1944
          (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
          kMicrosInSecond;
      std::string report = ToString(secs_elapsed) + "," +
                           ToString(total_ops_done_snapshot - last_report_) +
                           "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;
  const uint64_t report_interval_secs_;
1968
  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
1969 1970 1971 1972 1973 1974
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;
};

// Operation categories used to bucket per-operation latency histograms.
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

// Human-readable names for each OperationType, used in histogram output.
static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
                          OperationTypeString = {
  {kRead, "read"},
  {kWrite, "write"},
  {kDelete, "delete"},
  {kSeek, "seek"},
  {kMerge, "merge"},
  {kUpdate, "update"},
  {kCompress, "compress"},
  // Bug fix: this entry previously used kCompress as its key again, so the
  // duplicate was dropped and kUncompress was left unmapped.
  {kUncompress, "uncompress"},
  {kCrc, "crc"},
  {kHash, "hash"},
  {kOthers, "op"}
};

2004
class CombinedStats;
2005 2006
class Stats {
 private:
2007
  SystemClock* clock_;
2008
  int id_;
2009
  uint64_t start_ = 0;
2010
  uint64_t sine_interval_;
D
Dmitri Smirnov 已提交
2011
  uint64_t finish_;
2012
  double seconds_;
D
Dmitri Smirnov 已提交
2013 2014 2015 2016 2017 2018
  uint64_t done_;
  uint64_t last_report_done_;
  uint64_t next_report_;
  uint64_t bytes_;
  uint64_t last_op_finish_;
  uint64_t last_report_finish_;
2019
  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
2020
                     std::hash<unsigned char>> hist_;
2021
  std::string message_;
2022
  bool exclude_from_merge_;
2023
  ReporterAgent* reporter_agent_;  // does not own
2024
  friend class CombinedStats;
2025 2026

 public:
2027
  Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
2028

2029 2030 2031 2032
  void SetReporterAgent(ReporterAgent* reporter_agent) {
    reporter_agent_ = reporter_agent;
  }

2033 2034 2035
  void Start(int id) {
    id_ = id;
    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2036
    last_op_finish_ = start_;
2037
    hist_.clear();
2038
    done_ = 0;
2039
    last_report_done_ = 0;
2040 2041
    bytes_ = 0;
    seconds_ = 0;
2042 2043
    start_ = clock_->NowMicros();
    sine_interval_ = clock_->NowMicros();
2044
    finish_ = start_;
2045
    last_report_finish_ = start_;
2046
    message_.clear();
2047 2048
    // When set, stats from this thread won't be merged with others.
    exclude_from_merge_ = false;
2049 2050 2051
  }

  void Merge(const Stats& other) {
2052 2053 2054
    if (other.exclude_from_merge_)
      return;

2055 2056 2057
    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
      auto this_it = hist_.find(it->first);
      if (this_it != hist_.end()) {
2058
        this_it->second->Merge(*(other.hist_.at(it->first)));
2059 2060 2061 2062 2063
      } else {
        hist_.insert({ it->first, it->second });
      }
    }

2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074
    done_ += other.done_;
    bytes_ += other.bytes_;
    seconds_ += other.seconds_;
    if (other.start_ < start_) start_ = other.start_;
    if (other.finish_ > finish_) finish_ = other.finish_;

    // Just keep the messages from one thread
    if (message_.empty()) message_ = other.message_;
  }

  void Stop() {
2075
    finish_ = clock_->NowMicros();
2076 2077 2078 2079 2080 2081 2082
    seconds_ = (finish_ - start_) * 1e-6;
  }

  void AddMessage(Slice msg) {
    AppendWithSpace(&message_, msg);
  }

2083
  void SetId(int id) { id_ = id; }
2084
  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2085

2086 2087 2088 2089
  void PrintThreadStatus() {
    std::vector<ThreadStatus> thread_list;
    FLAGS_env->GetThreadList(&thread_list);

2090
    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
2091
        "ThreadID", "ThreadType", "cfName", "Operation",
2092
        "ElapsedTime", "Stage", "State", "OperationProperties");
2093

2094
    int64_t current_time = 0;
2095
    clock_->GetCurrentTime(&current_time).PermitUncheckedError();
2096
    for (auto ts : thread_list) {
2097
      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
2098 2099 2100 2101
          ts.thread_id,
          ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
          ts.cf_name.c_str(),
          ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2102
          ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2103
          ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2104
          ThreadStatus::GetStateName(ts.state_type).c_str());
2105 2106 2107 2108 2109 2110 2111 2112

      auto op_properties = ThreadStatus::InterpretOperationProperties(
          ts.operation_type, ts.op_properties);
      for (const auto& op_prop : op_properties) {
        fprintf(stderr, " %s %" PRIu64" |",
            op_prop.first.c_str(), op_prop.second);
      }
      fprintf(stderr, "\n");
2113 2114 2115
    }
  }

2116
  void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
2117 2118 2119 2120 2121 2122 2123 2124 2125

  uint64_t GetSineInterval() {
    return sine_interval_;
  }

  uint64_t GetStart() {
    return start_;
  }

2126 2127
  void ResetLastOpTime() {
    // Set to now to avoid latency from calls to SleepForMicroseconds
2128
    last_op_finish_ = clock_->NowMicros();
2129 2130
  }

2131 2132
  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
                   enum OperationType op_type = kOthers) {
2133 2134 2135
    if (reporter_agent_) {
      reporter_agent_->ReportFinishedOps(num_ops);
    }
2136
    if (FLAGS_histogram) {
2137
      uint64_t now = clock_->NowMicros();
D
Dmitri Smirnov 已提交
2138
      uint64_t micros = now - last_op_finish_;
2139 2140 2141

      if (hist_.find(op_type) == hist_.end())
      {
2142 2143
        auto hist_temp = std::make_shared<HistogramImpl>();
        hist_.insert({op_type, std::move(hist_temp)});
2144
      }
2145
      hist_[op_type]->Add(micros);
2146

2147
      if (micros > 20000 && !FLAGS_stats_interval) {
D
Dmitri Smirnov 已提交
2148
        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2149 2150 2151 2152 2153
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

2154
    done_ += num_ops;
2155
    if (done_ >= next_report_) {
2156 2157 2158 2159 2160 2161 2162 2163
      if (!FLAGS_stats_interval) {
        if      (next_report_ < 1000)   next_report_ += 100;
        else if (next_report_ < 5000)   next_report_ += 500;
        else if (next_report_ < 10000)  next_report_ += 1000;
        else if (next_report_ < 50000)  next_report_ += 5000;
        else if (next_report_ < 100000) next_report_ += 10000;
        else if (next_report_ < 500000) next_report_ += 50000;
        else                            next_report_ += 100000;
2164
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2165
      } else {
2166
        uint64_t now = clock_->NowMicros();
2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178
        int64_t usecs_since_last = now - last_report_finish_;

        // Determine whether to print status where interval is either
        // each N operations or each N seconds.

        if (FLAGS_stats_interval_seconds &&
            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
          // Don't check again for this many operations
          next_report_ += FLAGS_stats_interval;

        } else {
          fprintf(stderr,
2179 2180
                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64
                  ") ops and "
2181
                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
2182
                  clock_->TimeToString(now / 1000000).c_str(), id_,
2183
                  done_ - last_report_done_, done_,
2184
                  (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
2185 2186 2187 2188
                  done_ / ((now - start_) / 1000000.0),
                  (now - last_report_finish_) / 1000000.0,
                  (now - start_) / 1000000.0);

2189
          if (id_ == 0 && FLAGS_stats_per_interval) {
2190 2191 2192 2193 2194 2195 2196
            std::string stats;

            if (db_with_cfh && db_with_cfh->num_created.load()) {
              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                    &stats))
                  fprintf(stderr, "%s\n", stats.c_str());
2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226
                if (FLAGS_show_table_properties) {
                  for (int level = 0; level < FLAGS_num_levels; ++level) {
                    if (db->GetProperty(
                            db_with_cfh->cfh[i],
                            "rocksdb.aggregated-table-properties-at-level" +
                                ToString(level),
                            &stats)) {
                      if (stats.find("# entries=0") == std::string::npos) {
                        fprintf(stderr, "Level[%d]: %s\n", level,
                                stats.c_str());
                      }
                    }
                  }
                }
              }
            } else if (db) {
              if (db->GetProperty("rocksdb.stats", &stats)) {
                fprintf(stderr, "%s\n", stats.c_str());
              }
              if (FLAGS_show_table_properties) {
                for (int level = 0; level < FLAGS_num_levels; ++level) {
                  if (db->GetProperty(
                          "rocksdb.aggregated-table-properties-at-level" +
                              ToString(level),
                          &stats)) {
                    if (stats.find("# entries=0") == std::string::npos) {
                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
                    }
                  }
                }
2227 2228
              }
            }
2229
          }
M
Mark Callaghan 已提交
2230

2231 2232 2233 2234
          next_report_ += FLAGS_stats_interval;
          last_report_finish_ = now;
          last_report_done_ = done_;
        }
2235
      }
2236 2237 2238 2239
      if (id_ == 0 && FLAGS_thread_status_per_interval) {
        PrintThreadStatus();
      }
      fflush(stderr);
2240 2241 2242 2243 2244 2245 2246 2247 2248
    }
  }

  void AddBytes(int64_t n) {
    bytes_ += n;
  }

  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
2249
    // that does not call FinishedOps().
2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262
    if (done_ < 1) done_ = 1;

    std::string extra;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      double elapsed = (finish_ - start_) * 1e-6;
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
2263 2264
    double elapsed = (finish_ - start_) * 1e-6;
    double throughput = (double)done_/elapsed;
2265

D
Dhruba Borthakur 已提交
2266
    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
2267
            name.ToString().c_str(),
2268
            seconds_ * 1e6 / done_,
D
Dhruba Borthakur 已提交
2269
            (long)throughput,
2270 2271 2272
            (extra.empty() ? "" : " "),
            extra.c_str());
    if (FLAGS_histogram) {
2273 2274 2275
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
2276
                it->second->ToString().c_str());
2277
      }
2278
    }
2279
    if (FLAGS_report_file_operations) {
2280 2281 2282 2283 2284
      auto* counted_fs =
          FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
      assert(counted_fs);
      fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
      counted_fs->ResetCounters();
2285
    }
2286 2287 2288 2289
    fflush(stdout);
  }
};

2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358
// Aggregates the Stats of repeated runs of a single benchmark and reports
// average and median throughput across those runs.
class CombinedStats {
 public:
  // Folds one finished run into the per-run throughput lists. A run that
  // reported zero ops is counted as one op so the rate stays finite.
  void AddStats(const Stats& stat) {
    uint64_t total_ops = stat.done_;
    uint64_t total_bytes = stat.bytes_;

    if (total_ops < 1) {
      total_ops = 1;
    }

    // Rate is based on this run's wall-clock interval.
    double elapsed = (stat.finish_ - stat.start_) * 1e-6;
    throughput_ops_.emplace_back(total_ops / elapsed);

    if (total_bytes > 0) {
      double mbs = (total_bytes / 1048576.0);
      throughput_mbs_.emplace_back(mbs / elapsed);
    }
  }

  // Prints "[AVG n runs]" and "[MEDIAN n runs]" summary lines. The MB/sec
  // columns appear only when every run recorded a byte count.
  void Report(const std::string& bench_name) {
    const char* name = bench_name.c_str();
    if (throughput_ops_.empty()) {
      // Nothing recorded; avoid dividing by zero / asserting below.
      fprintf(stdout, "%s [AVG 0 runs] : no stats collected\n", name);
      return;
    }
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      fprintf(stdout,
              "%s [AVG    %d runs] : %d ops/sec; %6.1f MB/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              CalcAvg(throughput_mbs_), name, num_runs,
              static_cast<int>(CalcMedian(throughput_ops_)),
              CalcMedian(throughput_mbs_));
    } else {
      fprintf(stdout,
              "%s [AVG    %d runs] : %d ops/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
    }
  }

 private:
  // Arithmetic mean. Takes a const reference: the data is only read, so the
  // previous pass-by-value copy was wasted work.
  double CalcAvg(const std::vector<double>& data) {
    double avg = 0;
    for (double x : data) {
      avg += x;
    }
    avg = avg / data.size();
    return avg;
  }

  // Median. Deliberately takes its argument by value: the local copy is
  // sorted so the caller's vector is left untouched.
  double CalcMedian(std::vector<double> data) {
    assert(data.size() > 0);
    std::sort(data.begin(), data.end());

    size_t mid = data.size() / 2;
    if (data.size() % 2 == 1) {
      // Odd number of entries
      return data[mid];
    } else {
      // Even number of entries
      return (data[mid] + data[mid - 1]) / 2;
    }
  }

  std::vector<double> throughput_ops_;
  std::vector<double> throughput_mbs_;
};

2359 2360 2361 2362 2363 2364 2365 2366
// Emulates an application clock that hands out monotonically increasing
// 64-bit user-defined timestamps for writes and reads.
class TimestampEmulator {
 private:
  // Next timestamp to hand out; doubles as "current time".
  std::atomic<uint64_t> timestamp_;

 public:
  TimestampEmulator() : timestamp_(0) {}
  uint64_t Get() const { return timestamp_.load(); }
  void Inc() { timestamp_++; }
  // Encodes the next timestamp into `scratch` (fixed 8-byte encoding) and
  // returns a Slice over it. `scratch` must hold at least 8 bytes.
  Slice Allocate(char* scratch) {
    // TODO: support larger timestamp sizes
    assert(FLAGS_user_timestamp_size == 8);
    assert(scratch);
    uint64_t ts = timestamp_.fetch_add(1);
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
  // Produces a timestamp for a read: the latest one when
  // --read_with_latest_user_timestamp is set, otherwise one chosen
  // uniformly from the past.
  Slice GetTimestampForRead(Random64& rand, char* scratch) {
    assert(FLAGS_user_timestamp_size == 8);
    assert(scratch);
    if (FLAGS_read_with_latest_user_timestamp) {
      return Allocate(scratch);
    }
    // Choose a random timestamp from the past. Guard against the clock
    // still being at zero: `rand.Next() % 0` is undefined behavior.
    uint64_t max_ts = Get();
    uint64_t ts = (max_ts > 0) ? rand.Next() % max_ts : 0;
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
};

2388 2389 2390 2391 2392
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
  port::Mutex mu;
  port::CondVar cv;
  int total;
2393
  int perf_level;
2394
  std::shared_ptr<RateLimiter> write_rate_limiter;
2395
  std::shared_ptr<RateLimiter> read_rate_limiter;
2396 2397 2398 2399 2400 2401 2402

  // Each thread goes through the following states:
  //    (1) initializing
  //    (2) waiting for others to be initialized
  //    (3) running
  //    (4) done

2403 2404
  long num_initialized;
  long num_done;
2405 2406
  bool start;

2407
  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
2408 2409 2410 2411 2412
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;             // 0..n-1 when running in n threads
2413
  Random64 rand;         // Has different seeds for different threads
2414
  Stats stats;
2415
  SharedState* shared;
2416

2417 2418
  explicit ThreadState(int index)
      : tid(index), rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {}
2419 2420
};

M
Mark Callaghan 已提交
2421 2422
class Duration {
 public:
D
Dmitri Smirnov 已提交
2423
  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
M
Mark Callaghan 已提交
2424 2425
    max_seconds_ = max_seconds;
    max_ops_= max_ops;
2426
    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
M
Mark Callaghan 已提交
2427 2428 2429 2430
    ops_ = 0;
    start_at_ = FLAGS_env->NowMicros();
  }

2431 2432
  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }

L
Lei Jin 已提交
2433
  bool Done(int64_t increment) {
2434
    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
M
Mark Callaghan 已提交
2435 2436 2437
    ops_ += increment;

    if (max_seconds_) {
2438
      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
2439 2440
      auto granularity = FLAGS_ops_between_duration_checks;
      if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
D
Dmitri Smirnov 已提交
2441 2442
        uint64_t now = FLAGS_env->NowMicros();
        return ((now - start_at_) / 1000000) >= max_seconds_;
M
Mark Callaghan 已提交
2443 2444 2445 2446 2447 2448 2449 2450 2451
      } else {
        return false;
      }
    } else {
      return ops_ > max_ops_;
    }
  }

 private:
D
Dmitri Smirnov 已提交
2452
  uint64_t max_seconds_;
2453
  int64_t max_ops_;
2454
  int64_t ops_per_stage_;
2455
  int64_t ops_;
D
Dmitri Smirnov 已提交
2456
  uint64_t start_at_;
M
Mark Callaghan 已提交
2457 2458
};

J
jorlow@chromium.org 已提交
2459 2460
class Benchmark {
 private:
2461 2462
  std::shared_ptr<Cache> cache_;
  std::shared_ptr<Cache> compressed_cache_;
T
Tyler Harter 已提交
2463
  const SliceTransform* prefix_extractor_;
2464 2465
  DBWithColumnFamilies db_;
  std::vector<DBWithColumnFamilies> multi_dbs_;
2466
  int64_t num_;
2467
  int key_size_;
2468
  int user_timestamp_size_;
2469 2470
  int prefix_size_;
  int64_t keys_per_prefix_;
L
Lei Jin 已提交
2471
  int64_t entries_per_batch_;
2472
  int64_t writes_before_delete_range_;
A
Andrew Kryczka 已提交
2473 2474 2475
  int64_t writes_per_range_tombstone_;
  int64_t range_tombstone_width_;
  int64_t max_num_range_tombstones_;
2476
  ReadOptions read_options_;
2477
  WriteOptions write_options_;
2478
  Options open_options_;  // keep options around to properly destroy db later
2479
#ifndef ROCKSDB_LITE
2480
  TraceOptions trace_options_;
2481
  TraceOptions block_cache_trace_options_;
2482
#endif
2483
  int64_t reads_;
Y
Yueh-Hsuan Chiang 已提交
2484
  int64_t deletes_;
2485
  double read_random_exp_range_;
2486 2487 2488
  int64_t writes_;
  int64_t readwrites_;
  int64_t merge_keys_;
2489
  bool report_file_operations_;
2490
  bool use_blob_db_;  // Stacked BlobDB
2491
  std::vector<std::string> keys_;
2492

2493 2494
  // EventListener that lets the benchmark observe background-error recovery:
  // it can veto automatic recovery and lets a thread block until a recovery
  // cycle completes. In ROCKSDB_LITE builds it degrades to no-op stubs.
  class ErrorHandlerListener : public EventListener {
   public:
#ifndef ROCKSDB_LITE
    ErrorHandlerListener()
        : mutex_(),
          cv_(&mutex_),
          no_auto_recovery_(false),
          recovery_complete_(false) {}

    ~ErrorHandlerListener() override {}

    const char* Name() const override { return kClassName(); }
    static const char* kClassName() { return "ErrorHandlerListener"; }

    // Called when recovery starts; clears *auto_recovery when automatic
    // recovery has been disabled via EnableAutoRecovery(false).
    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
                              Status /*bg_error*/,
                              bool* auto_recovery) override {
      if (*auto_recovery && no_auto_recovery_) {
        *auto_recovery = false;
      }
    }

    // Signals any thread blocked in WaitForRecovery().
    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
      InstrumentedMutexLock l(&mutex_);
      recovery_complete_ = true;
      cv_.SignalAll();
    }

    // Blocks until recovery completes or abs_time_us passes. Returns true
    // (and consumes the completion flag) iff recovery finished.
    bool WaitForRecovery(uint64_t abs_time_us) {
      InstrumentedMutexLock l(&mutex_);
      if (!recovery_complete_) {
        cv_.TimedWait(abs_time_us);
      }
      if (recovery_complete_) {
        recovery_complete_ = false;
        return true;
      }
      return false;
    }

    // Note the inversion: the stored flag means "do NOT auto-recover".
    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }

   private:
    InstrumentedMutex mutex_;
    InstrumentedCondVar cv_;
    bool no_auto_recovery_;
    bool recovery_complete_;
#else   // ROCKSDB_LITE
    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
    void EnableAutoRecovery(bool /*enable*/) {}
#endif  // ROCKSDB_LITE
  };

  std::shared_ptr<ErrorHandlerListener> listener_;

2548 2549
  std::unique_ptr<TimestampEmulator> mock_app_clock_;

2550 2551 2552 2553 2554 2555 2556 2557
  // Validates flag combinations before any benchmark runs.
  // Returns false (after printing the reason) when the flags are unusable.
  bool SanityCheck() {
    if (FLAGS_compression_ratio <= 1) {
      return true;
    }
    fprintf(stderr, "compression_ratio should be between 0 and 1\n");
    return false;
  }

2558
  inline bool CompressSlice(const CompressionInfo& compression_info,
2559
                            const Slice& input, std::string* compressed) {
2560 2561 2562 2563
    constexpr uint32_t compress_format_version = 2;

    return CompressData(input, compression_info, compress_format_version,
                        compressed);
2564 2565
  }

2566
  // Prints the benchmark configuration banner to stdout: key/value sizing,
  // entry counts, estimated raw/file sizes, rate limits, compression setup,
  // and perf level. Exits the process when --enable_numa is set but NUMA is
  // unavailable. Called once before the benchmarks start.
  void PrintHeader(const Options& options) {
    PrintEnvironment();
    fprintf(stdout,
            "Keys:       %d bytes each (+ %d bytes user-defined timestamp)\n",
            FLAGS_key_size, FLAGS_user_timestamp_size);
    auto avg_value_size = FLAGS_value_size;
    if (FLAGS_value_size_distribution_type_e == kFixed) {
      fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
    } else {
      // Non-fixed distribution: report the midpoint as the average size.
      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
      fprintf(stdout, "Values:     %d avg bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
              FLAGS_value_size_distribution_type.c_str(),
              FLAGS_value_size_min, FLAGS_value_size_max);
    }
    fprintf(stdout, "Entries:    %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix:    %" PRIu64 "\n", keys_per_prefix_);
    // Estimated uncompressed footprint of all key/value pairs.
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_)
             / 1048576.0));
    // Estimated on-disk footprint after value compression.
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio)
              * num_)
             / 1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      // Binary was built without libnuma; the flag cannot be honored.
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);
    if (options.memtable_factory != nullptr) {
      fprintf(stdout, "Memtablerep: %s\n",
              options.memtable_factory->GetId().c_str());
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    // Warn about build/config choices that would skew results.
    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }

2626
  // Prints warnings for configurations that make results misleading:
  // unoptimized/assert-enabled builds, and a requested compression type
  // that is unavailable or ineffective (checked by compressing a sample).
  void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(stdout,
            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
            );
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
      // The test string should not be too small.
      const int len = FLAGS_block_size;
      // A run of identical bytes: trivially compressible, so failure to
      // shrink it means the codec is effectively not working.
      std::string input_str(len, 'y');
      std::string compressed;
      CompressionOptions opts;
      CompressionContext context(FLAGS_compression_type_e);
      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                           FLAGS_compression_type_e,
                           FLAGS_sample_for_compression);
      bool result = CompressSlice(info, Slice(input_str), &compressed);

      if (!result) {
        fprintf(stdout, "WARNING: %s compression is not enabled\n",
                compression);
      } else if (compressed.size() >= input_str.size()) {
        fprintf(stdout, "WARNING: %s compression is not effective\n",
                compression);
      }
    }
  }

K
kailiu 已提交
2658 2659 2660 2661 2662 2663 2664
// Currently the following isn't equivalent to OS_LINUX.
#if defined(__linux)
  // Strips leading and trailing whitespace from `s` (used when parsing
  // /proc/cpuinfo lines). Returns a sub-slice of the input; no copy is made.
  static Slice TrimSpace(Slice s) {
    unsigned int start = 0;
    // Cast to unsigned char before calling isspace(): passing a plain char
    // with a negative value is undefined behavior.
    while (start < s.size() &&
           isspace(static_cast<unsigned char>(s[start]))) {
      start++;
    }
    unsigned int limit = static_cast<unsigned int>(s.size());
    while (limit > start &&
           isspace(static_cast<unsigned char>(s[limit - 1]))) {
      limit--;
    }
    return Slice(s.data() + start, limit - start);
  }
#endif

2673
  // Prints the runtime environment (RocksDB version, date, CPU model and
  // cache size) to stderr. CPU discovery is per-platform: /proc/cpuinfo on
  // Linux, host_info/sysctlbyname on macOS, sysctl on FreeBSD; elsewhere
  // only the version line is printed.
  void PrintEnvironment() {
    fprintf(stderr, "RocksDB:    version %d.%d\n",
            kMajorVersion, kMinorVersion);

#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
    fprintf(stderr, "Date:       %s",
            ctime_r(&now, buf));  // ctime_r() adds newline

#if defined(__linux)
    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      // Count "model name" lines to count logical CPUs; keep the last
      // model / cache-size strings seen.
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
#elif defined(__APPLE__)
    struct host_basic_info h;
    size_t hlen = HOST_BASIC_INFO_COUNT;
    if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
                  (uint32_t*)&hlen) == KERN_SUCCESS) {
      std::string cpu_type;
      std::string cache_size;
      size_t hcache_size;
      hlen = sizeof(hcache_size);
      // NOTE: hw.cachelinesize is the cache *line* size, not total cache
      // size, so the CPUCache line reports bytes-per-line here.
      if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
        cache_size = std::to_string(hcache_size);
      }
      switch (h.cpu_type) {
        case CPU_TYPE_X86_64:
          cpu_type = "x86_64";
          break;
        case CPU_TYPE_ARM64:
          cpu_type = "arm64";
          break;
        default:
          // Other architectures: leave cpu_type empty.
          break;
      }
      fprintf(stderr, "CPU:        %d * %s\n", h.max_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
#elif defined(__FreeBSD__)
    int ncpus;
    size_t len = sizeof(ncpus);
    int mib[2] = {CTL_HW, HW_NCPU};
    if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
      char cpu_type[16];
      len = sizeof(cpu_type) - 1;
      mib[1] = HW_MACHINE;
      if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;

      fprintf(stderr, "CPU:        %d * %s\n", ncpus, cpu_type);
      // no programmatic way to get the cache line size except on PPC
    }
#endif
#endif
  }

2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773
  // Returns true when the write time embedded in `key` is more than
  // --time_range older than the emulator's current clock.
  // Layout: bytes [8,16) of the key hold a 64-bit timestamp stored
  // most-significant-byte first (as written by GenerateKeyFromInt); bytes
  // [0,8) are the key prefix. The key must be at least 16 bytes — no
  // length check is performed here.
  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
                         const Slice& key) {
    const char* pos = key.data();
    pos += 8;  // skip the 8-byte prefix to reach the timestamp
    uint64_t timestamp = 0;
    if (port::kLittleEndian) {
      // Reassemble the big-endian bytes into a native integer.
      int bytes_to_fill = 8;
      for (int i = 0; i < bytes_to_fill; ++i) {
        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
                      << ((bytes_to_fill - i - 1) << 3));
      }
    } else {
      // Big-endian host: the bytes are already in native order.
      memcpy(&timestamp, pos, sizeof(timestamp));
    }
    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
  }

  // Compaction filter that drops entries whose key-embedded timestamp (as
  // decoded by KeyExpired) is older than --time_range. Used by the
  // TimeSeries benchmarks to age out old data during compaction.
  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    // Returning true tells compaction to drop the entry.
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };

2785 2786
  class KeepFilter : public CompactionFilter {
   public:
2787 2788 2789
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
2790 2791 2792
      return false;
    }

2793
    const char* Name() const override { return "KeepFilter"; }
2794 2795
  };

Y
Yi Wu 已提交
2796 2797 2798 2799 2800
  // Builds a block cache per the cache-related flags. Returns nullptr when
  // `capacity` is non-positive (cache disabled). Exits the process on
  // unsupported configurations (clock cache unavailable, memkind not
  // linked, unknown secondary-cache URI).
  std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_use_clock_cache) {
      auto cache = NewClockCache(static_cast<size_t>(capacity),
                                 FLAGS_cache_numshardbits);
      if (!cache) {
        fprintf(stderr, "Clock cache not supported.");
        exit(1);
      }
      return cache;
    } else {
      LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
#ifdef MEMKIND
          // Optional persistent-memory allocator, only when built with it.
          FLAGS_use_cache_memkind_kmem_allocator
              ? std::make_shared<MemkindKmemAllocator>()
              : nullptr
#else
          nullptr
#endif
      );
      if (FLAGS_use_cache_memkind_kmem_allocator) {
#ifndef MEMKIND
        fprintf(stderr, "Memkind library is not linked with the binary.");
        exit(1);
#endif
      }
#ifndef ROCKSDB_LITE
      if (!FLAGS_secondary_cache_uri.empty()) {
        // NOTE(review): `secondary_cache` is declared at file scope
        // (outside this view) — confirm it is not shadowed elsewhere.
        Status s = SecondaryCache::CreateFromString(
            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
        if (secondary_cache == nullptr) {
          fprintf(
              stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.secondary_cache = secondary_cache;
      }
#endif  // ROCKSDB_LITE

      // A flag-configured compressed LRU secondary cache takes precedence
      // over (overwrites) any URI-created secondary cache set above.
      if (FLAGS_use_lru_secondary_cache) {
        LRUSecondaryCacheOptions secondary_cache_opts;
        secondary_cache_opts.capacity = FLAGS_lru_secondary_cache_size;
        secondary_cache_opts.num_shard_bits =
            FLAGS_lru_secondary_cache_numshardbits;
        secondary_cache_opts.high_pri_pool_ratio =
            FLAGS_lru_secondary_cache_high_pri_pool_ratio;
        secondary_cache_opts.compression_type =
            FLAGS_lru_secondary_cache_compression_type_e;
        secondary_cache_opts.compress_format_version =
            FLAGS_lru_secondary_cache_compress_format_version;
        opts.secondary_cache = NewLRUSecondaryCache(secondary_cache_opts);
      }

      return NewLRUCache(opts);
    }
  }

J
jorlow@chromium.org 已提交
2859
 public:
2860
  // Constructs the benchmark harness from command-line flags: builds the
  // caches, sizes the workloads, optionally wraps FLAGS_env for file-op
  // counting, cleans up any pre-existing database (unless
  // --use_existing_db), and installs the error-recovery listener.
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        user_timestamp_size_(FLAGS_user_timestamp_size),
        prefix_size_(FLAGS_prefix_size),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        // Negative read/write counts mean "default to --num".
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db)  // Stacked BlobDB
#else
        use_blob_db_(false)  // Stacked BlobDB
#endif  // !ROCKSDB_LITE
  {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      // Wrap the env with a counting file system so per-benchmark file
      // operation stats can be reported. The old env object is leaked.
      FLAGS_env = new CompositeEnvWrapper(
          FLAGS_env,
          std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    // Remove stale heap-profiler dumps from the DB directory.
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      // Start from a clean slate: destroy any existing DB (and BlobDB)
      // plus its WAL directory before the benchmarks open it.
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        // Stacked BlobDB
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      // With multiple DBs, FLAGS_db is the parent directory of the
      // per-instance subdirectories, so it must exist up front.
      if (FLAGS_num_multi_db > 1) {
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
    if (user_timestamp_size_ > 0) {
      mock_app_clock_.reset(new TimestampEmulator());
    }
  }

2944
  void DeleteDBs() {
A
agiardullo 已提交
2945
    db_.DeleteDBs();
2946 2947
    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
      delete dbwcf.db;
2948
    }
2949 2950 2951 2952
  }

  // Tears down the harness: closes all DBs, frees the prefix extractor,
  // and detaches the block cache. Ordering matters — the write buffer
  // manager holds a cache reference and must be released before
  // DisownData().
  ~Benchmark() {
    DeleteDBs();
    delete prefix_extractor_;
    if (cache_.get() != nullptr) {
      // Clear cache reference first
      open_options_.write_buffer_manager.reset();
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }

2962
  // Allocates a key_size_-byte buffer, transfers ownership to *key_guard,
  // and returns a Slice over it. The contents are uninitialized; callers
  // fill them via GenerateKeyFromInt().
  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    key_guard->reset(new char[key_size_]);
    return Slice(key_guard->get(), key_size_);
  }

2969
  // Generate a key according to the given specification and random number.
  // The resulting key will have the following format:
  //   - If keys_per_prefix_ is positive, extra trailing bytes are either cut
  //     off or padded with '0'.
  //     The prefix value is derived from the key value.
  //     ----------------------------
  //     | prefix 00000 | key 00000 |
  //     ----------------------------
  //
  //   - If keys_per_prefix_ is 0, the key is simply a binary representation
  //     of the random number followed by trailing '0's
  //     ----------------------------
  //     |        key 00000         |
  //     ----------------------------
  // With --use_existing_keys the pre-loaded keys_ vector is consulted
  // instead and v is treated as an index into it.
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    // Writes in place into the buffer previously set up by AllocateKey().
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        // Emit most-significant byte first so keys sort numerically.
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    // Encode v itself, again most-significant byte first on little-endian.
    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    // Pad any remaining key bytes with '0' characters.
    if (key_size_ > pos - start) {
      memset(pos, '0', key_size_ - (pos - start));
    }
  }

3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035
  // Like GenerateKeyFromInt, but when --seek_missing_prefix is set the key
  // is mutated so its prefix cannot match anything in the DB, forcing
  // seeks to miss.
  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
    GenerateKeyFromInt(v, num_keys, key);
    if (FLAGS_seek_missing_prefix) {
      assert(prefix_size_ > 8);
      char* key_ptr = const_cast<char*>(key->data());
      // This relies on GenerateKeyFromInt filling paddings with '0's.
      // Putting a '1' will create a non-existing prefix.
      key_ptr[8] = '1';
    }
  }

3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047
  // Builds the filesystem path for DB instance `id` when benchmarking
  // multiple DBs: ensures `base_name` ends with the platform path separator
  // (unless empty) and appends the decimal id.
  // Uses std::to_string directly instead of the project ToString helper,
  // which is equivalent for integral values.
  std::string GetPathForMultiple(std::string base_name, size_t id) {
    if (!base_name.empty()) {
#ifndef OS_WIN
      if (base_name.back() != '/') {
        base_name += '/';
      }
#else
      if (base_name.back() != '\\') {
        base_name += '\\';
      }
#endif
    }
    return base_name + std::to_string(id);
  }

3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065
  // Cross-checks the contents of db_ against a reference DB opened
  // read-only from `truth_db_name`. Exits the process on open failure;
  // mismatches fire assert() (so this is only effective in debug builds).
  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    // total_order_seek so iteration order matches across both DBs even if
    // prefix extractors are configured.
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    // (walk both iterators in lockstep; values must agree pairwise).
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No more key should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }

3084
  // Terminates the benchmark with a failure status after cleaning up any
  // DB instances that were opened.
  void ErrorExit() {
    DeleteDBs();
    exit(1);
  }

J
jorlow@chromium.org 已提交
3089
  void Run() {
3090
    if (!SanityCheck()) {
3091
      ErrorExit();
3092
    }
3093
    Open(&open_options_);
3094
    PrintHeader(open_options_);
3095 3096
    std::stringstream benchmark_stream(FLAGS_benchmarks);
    std::string name;
3097
    std::unique_ptr<ExpiredTimeFilter> filter;
3098
    while (std::getline(benchmark_stream, name, ',')) {
X
Xing Jin 已提交
3099
      // Sanitize parameters
3100
      num_ = FLAGS_num;
3101
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
3102
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
Y
Yueh-Hsuan Chiang 已提交
3103
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
3104
      value_size = FLAGS_value_size;
3105
      key_size_ = FLAGS_key_size;
3106
      entries_per_batch_ = FLAGS_batch_size;
3107
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
A
Andrew Kryczka 已提交
3108 3109 3110
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
3111
      write_options_ = WriteOptions();
3112
      read_random_exp_range_ = FLAGS_read_random_exp_range;
3113 3114 3115
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
H
heyongqiang 已提交
3116
      write_options_.disableWAL = FLAGS_disable_wal;
3117 3118
      write_options_.rate_limiter_priority =
          FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
3119 3120 3121
      read_options_ = ReadOptions(FLAGS_verify_checksum, true);
      read_options_.total_order_seek = FLAGS_total_order_seek;
      read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
3122 3123
      read_options_.rate_limiter_priority =
          FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
3124 3125 3126
      read_options_.tailing = FLAGS_use_tailing_iterator;
      read_options_.readahead_size = FLAGS_readahead_size;
      read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
3127
      read_options_.async_io = FLAGS_async_io;
H
heyongqiang 已提交
3128

3129
      void (Benchmark::*method)(ThreadState*) = nullptr;
A
agiardullo 已提交
3130 3131
      void (Benchmark::*post_process_method)() = nullptr;

3132
      bool fresh_db = false;
3133
      int num_threads = FLAGS_threads;
3134

3135 3136 3137 3138 3139 3140
      int num_repeat = 1;
      int num_warmup = 0;
      if (!name.empty() && *name.rbegin() == ']') {
        auto it = name.find('[');
        if (it == std::string::npos) {
          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
3141
          ErrorExit();
3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164
        }
        std::string args = name.substr(it + 1);
        args.resize(args.size() - 1);
        name.resize(it);

        std::string bench_arg;
        std::stringstream args_stream(args);
        while (std::getline(args_stream, bench_arg, '-')) {
          if (bench_arg.empty()) {
            continue;
          }
          if (bench_arg[0] == 'X') {
            // Repeat the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_repeat = std::stoi(num_str);
          } else if (bench_arg[0] == 'W') {
            // Warm up the benchmark for n times
            std::string num_str = bench_arg.substr(1);
            num_warmup = std::stoi(num_str);
          }
        }
      }

3165 3166 3167 3168 3169 3170 3171 3172 3173
      // Both fillseqdeterministic and filluniquerandomdeterministic
      // fill the levels except the max level with UNIQUE_RANDOM
      // and fill the max level with fillseq and filluniquerandom, respectively
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please disable_auto_compactions in FillDeterministic "
                  "benchmark\n");
3174
          ErrorExit();
3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
      } else if (name == "fillseq") {
3189 3190
        fresh_db = true;
        method = &Benchmark::WriteSeq;
3191
      } else if (name == "fillbatch") {
3192 3193 3194
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
3195
      } else if (name == "fillrandom") {
3196 3197
        fresh_db = true;
        method = &Benchmark::WriteRandom;
3198 3199
      } else if (name == "filluniquerandom" ||
                 name == "fillanddeleteuniquerandom") {
3200 3201
        fresh_db = true;
        if (num_threads > 1) {
3202
          fprintf(stderr,
3203 3204
                  "filluniquerandom and fillanddeleteuniquerandom "
                  "multithreaded not supported, use 1 thread");
3205
          num_threads = 1;
3206 3207
        }
        method = &Benchmark::WriteUniqueRandom;
3208
      } else if (name == "overwrite") {
3209
        method = &Benchmark::WriteRandom;
3210
      } else if (name == "fillsync") {
3211 3212 3213 3214
        fresh_db = true;
        num_ /= 1000;
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
3215
      } else if (name == "fill100K") {
3216 3217
        fresh_db = true;
        num_ /= 1000;
3218
        value_size = 100 * 1000;
3219
        method = &Benchmark::WriteRandom;
3220
      } else if (name == "readseq") {
3221
        method = &Benchmark::ReadSequential;
3222 3223 3224 3225 3226
      } else if (name == "readtorowcache") {
        if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
          fprintf(stderr,
                  "Please set use_existing_keys to true and specify a "
                  "row cache size in readtorowcache benchmark\n");
3227
          ErrorExit();
3228 3229
        }
        method = &Benchmark::ReadToRowCache;
3230
      } else if (name == "readtocache") {
M
Mark Callaghan 已提交
3231 3232 3233
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
3234
      } else if (name == "readreverse") {
3235
        method = &Benchmark::ReadReverse;
3236
      } else if (name == "readrandom") {
3237 3238 3239 3240
        if (FLAGS_multiread_stride) {
          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                  entries_per_batch_);
        }
3241
        method = &Benchmark::ReadRandom;
3242
      } else if (name == "readrandomfast") {
L
Lei Jin 已提交
3243
        method = &Benchmark::ReadRandomFast;
3244
      } else if (name == "multireadrandom") {
M
mike@arpaia.co 已提交
3245 3246
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
L
Lei Jin 已提交
3247
        method = &Benchmark::MultiReadRandom;
3248 3249 3250 3251
      } else if (name == "approximatesizerandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::ApproximateSizeRandom;
3252 3253
      } else if (name == "mixgraph") {
        method = &Benchmark::MixGraph;
3254
      } else if (name == "readmissing") {
L
Lei Jin 已提交
3255 3256
        ++key_size_;
        method = &Benchmark::ReadRandom;
3257
      } else if (name == "newiterator") {
3258
        method = &Benchmark::IteratorCreation;
3259
      } else if (name == "newiteratorwhilewriting") {
3260 3261
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
3262
      } else if (name == "seekrandom") {
S
Sanjay Ghemawat 已提交
3263
        method = &Benchmark::SeekRandom;
3264
      } else if (name == "seekrandomwhilewriting") {
L
Lei Jin 已提交
3265 3266
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
3267
      } else if (name == "seekrandomwhilemerging") {
3268 3269
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::SeekRandomWhileMerging;
3270
      } else if (name == "readrandomsmall") {
3271
        reads_ /= 1000;
3272
        method = &Benchmark::ReadRandom;
3273
      } else if (name == "deleteseq") {
S
Sanjay Ghemawat 已提交
3274
        method = &Benchmark::DeleteSeq;
3275
      } else if (name == "deleterandom") {
S
Sanjay Ghemawat 已提交
3276
        method = &Benchmark::DeleteRandom;
3277
      } else if (name == "readwhilewriting") {
3278 3279
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
3280
      } else if (name == "readwhilemerging") {
M
Mark Callaghan 已提交
3281 3282
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileMerging;
Y
Yi Wu 已提交
3283 3284 3285
      } else if (name == "readwhilescanning") {
        num_threads++;  // Add extra thread for scaning
        method = &Benchmark::ReadWhileScanning;
3286
      } else if (name == "readrandomwriterandom") {
3287
        method = &Benchmark::ReadRandomWriteRandom;
3288
      } else if (name == "readrandommergerandom") {
3289 3290
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3291
                  name.c_str());
3292
          ErrorExit();
3293
        }
L
Lei Jin 已提交
3294
        method = &Benchmark::ReadRandomMergeRandom;
3295
      } else if (name == "updaterandom") {
M
Mark Callaghan 已提交
3296
        method = &Benchmark::UpdateRandom;
P
Pooya Shareghi 已提交
3297 3298
      } else if (name == "xorupdaterandom") {
        method = &Benchmark::XORUpdateRandom;
3299
      } else if (name == "appendrandom") {
D
Deon Nicholas 已提交
3300
        method = &Benchmark::AppendRandom;
3301
      } else if (name == "mergerandom") {
D
Deon Nicholas 已提交
3302 3303
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3304
                  name.c_str());
L
Lei Jin 已提交
3305
          exit(1);
D
Deon Nicholas 已提交
3306
        }
L
Lei Jin 已提交
3307
        method = &Benchmark::MergeRandom;
3308
      } else if (name == "randomwithverify") {
3309
        method = &Benchmark::RandomWithVerify;
3310
      } else if (name == "fillseekseq") {
T
Tomislav Novak 已提交
3311
        method = &Benchmark::WriteSeqSeekSeq;
3312
      } else if (name == "compact") {
3313
        method = &Benchmark::Compact;
3314 3315
      } else if (name == "compactall") {
        CompactAll();
3316 3317 3318 3319 3320 3321 3322 3323 3324 3325
#ifndef ROCKSDB_LITE
      } else if (name == "compact0") {
        CompactLevel(0);
      } else if (name == "compact1") {
        CompactLevel(1);
      } else if (name == "waitforcompaction") {
        WaitForCompaction();
#endif
      } else if (name == "flush") {
        Flush();
3326
      } else if (name == "crc32c") {
3327
        method = &Benchmark::Crc32c;
3328
      } else if (name == "xxhash") {
I
xxHash  
Igor Canadi 已提交
3329
        method = &Benchmark::xxHash;
3330 3331 3332 3333
      } else if (name == "xxhash64") {
        method = &Benchmark::xxHash64;
      } else if (name == "xxh3") {
        method = &Benchmark::xxh3;
3334
      } else if (name == "acquireload") {
3335
        method = &Benchmark::AcquireLoad;
3336
      } else if (name == "compress") {
A
Albert Strasheim 已提交
3337
        method = &Benchmark::Compress;
3338
      } else if (name == "uncompress") {
A
Albert Strasheim 已提交
3339
        method = &Benchmark::Uncompress;
3340
#ifndef ROCKSDB_LITE
3341
      } else if (name == "randomtransaction") {
A
agiardullo 已提交
3342 3343
        method = &Benchmark::RandomTransaction;
        post_process_method = &Benchmark::RandomTransactionVerify;
3344
#endif  // ROCKSDB_LITE
A
Andres Noetzli 已提交
3345 3346 3347
      } else if (name == "randomreplacekeys") {
        fresh_db = true;
        method = &Benchmark::RandomReplaceKeys;
3348 3349 3350 3351 3352 3353 3354 3355 3356
      } else if (name == "timeseries") {
        timestamp_emulator_.reset(new TimestampEmulator());
        if (FLAGS_expire_style == "compaction_filter") {
          filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
          fprintf(stdout, "Compaction filter is used to remove expired data");
          open_options_.compaction_filter = filter.get();
        }
        fresh_db = true;
        method = &Benchmark::TimeSeries;
3357
      } else if (name == "stats") {
3358
        PrintStats("rocksdb.stats");
S
Siying Dong 已提交
3359 3360
      } else if (name == "resetstats") {
        ResetStats();
3361 3362
      } else if (name == "verify") {
        VerifyDBFromDB(FLAGS_truth_db);
3363
      } else if (name == "levelstats") {
3364
        PrintStats("rocksdb.levelstats");
3365 3366 3367 3368 3369 3370 3371 3372
      } else if (name == "memstats") {
        std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
                                      "rocksdb.cur-size-active-mem-table",
                                      "rocksdb.cur-size-all-mem-tables",
                                      "rocksdb.size-all-mem-tables",
                                      "rocksdb.num-entries-active-mem-table",
                                      "rocksdb.num-entries-imm-mem-tables"};
        PrintStats(keys);
3373
      } else if (name == "sstables") {
3374
        PrintStats("rocksdb.sstables");
3375 3376
      } else if (name == "stats_history") {
        PrintStatsHistory();
3377
#ifndef ROCKSDB_LITE
3378 3379 3380
      } else if (name == "replay") {
        if (num_threads > 1) {
          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
3381
          ErrorExit();
3382 3383 3384
        }
        if (FLAGS_trace_file == "") {
          fprintf(stderr, "Please set --trace_file to be replayed from\n");
3385
          ErrorExit();
3386 3387
        }
        method = &Benchmark::Replay;
3388
#endif  // ROCKSDB_LITE
3389 3390
      } else if (name == "getmergeoperands") {
        method = &Benchmark::GetMergeOperands;
3391 3392 3393 3394 3395 3396
#ifndef ROCKSDB_LITE
      } else if (name == "verifychecksum") {
        method = &Benchmark::VerifyChecksum;
      } else if (name == "verifyfilechecksums") {
        method = &Benchmark::VerifyFileChecksums;
#endif                             // ROCKSDB_LITE
3397 3398
      } else if (!name.empty()) {  // No error message for empty name
        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
3399
        ErrorExit();
3400
      }
3401 3402 3403 3404

      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
3405
                  name.c_str());
3406
          method = nullptr;
3407
        } else {
3408
          if (db_.db != nullptr) {
A
agiardullo 已提交
3409
            db_.DeleteDBs();
3410
            DestroyDB(FLAGS_db, open_options_);
3411
          }
3412
          Options options = open_options_;
3413
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
3414
            delete multi_dbs_[i].db;
3415 3416 3417 3418
            if (!open_options_.wal_dir.empty()) {
              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
            }
            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
3419 3420
          }
          multi_dbs_.clear();
3421
        }
3422
        Open(&open_options_);  // use open_options for the last accessed
3423 3424
      }

3425
      if (method != nullptr) {
3426
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
3427

3428
#ifndef ROCKSDB_LITE
3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439
        // A trace_file option can be provided both for trace and replay
        // operations. But db_bench does not support tracing and replaying at
        // the same time, for now. So, start tracing only when it is not a
        // replay.
        if (FLAGS_trace_file != "" && name != "replay") {
          std::unique_ptr<TraceWriter> trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_trace_file, &trace_writer);
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
3440
            ErrorExit();
3441 3442 3443 3444 3445
          }
          s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
3446
            ErrorExit();
3447 3448 3449 3450
          }
          fprintf(stdout, "Tracing the workload to: [%s]\n",
                  FLAGS_trace_file.c_str());
        }
3451 3452 3453 3454 3455 3456 3457
        // Start block cache tracing.
        if (!FLAGS_block_cache_trace_file.empty()) {
          // Sanity checks.
          if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
            fprintf(stderr,
                    "Block cache trace sampling frequency must be higher than "
                    "0.\n");
3458
            ErrorExit();
3459 3460 3461 3462 3463
          }
          if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
            fprintf(stderr,
                    "The maximum file size for block cache tracing must be "
                    "higher than 0.\n");
3464
            ErrorExit();
3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477
          }
          block_cache_trace_options_.max_trace_file_size =
              FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
          block_cache_trace_options_.sampling_frequency =
              FLAGS_block_cache_trace_sampling_frequency;
          std::unique_ptr<TraceWriter> block_cache_trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_block_cache_trace_file,
                                        &block_cache_trace_writer);
          if (!s.ok()) {
            fprintf(stderr,
                    "Encountered an error when creating trace writer, %s\n",
                    s.ToString().c_str());
3478
            ErrorExit();
3479 3480 3481 3482 3483 3484 3485 3486
          }
          s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
                                           std::move(block_cache_trace_writer));
          if (!s.ok()) {
            fprintf(
                stderr,
                "Encountered an error when starting block cache tracing, %s\n",
                s.ToString().c_str());
3487
            ErrorExit();
3488 3489 3490 3491
          }
          fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
                  FLAGS_block_cache_trace_file.c_str());
        }
3492
#endif  // ROCKSDB_LITE
3493

3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513
        if (num_warmup > 0) {
          printf("Warming up benchmark by running %d times\n", num_warmup);
        }

        for (int i = 0; i < num_warmup; i++) {
          RunBenchmark(num_threads, name, method);
        }

        if (num_repeat > 1) {
          printf("Running benchmark for %d times\n", num_repeat);
        }

        CombinedStats combined_stats;
        for (int i = 0; i < num_repeat; i++) {
          Stats stats = RunBenchmark(num_threads, name, method);
          combined_stats.AddStats(stats);
        }
        if (num_repeat > 1) {
          combined_stats.Report(name);
        }
J
jorlow@chromium.org 已提交
3514
      }
A
agiardullo 已提交
3515 3516 3517
      if (post_process_method != nullptr) {
        (this->*post_process_method)();
      }
J
jorlow@chromium.org 已提交
3518
    }
3519

3520 3521 3522 3523 3524 3525
    if (secondary_update_thread_) {
      secondary_update_stopped_.store(1, std::memory_order_relaxed);
      secondary_update_thread_->join();
      secondary_update_thread_.reset();
    }

3526
#ifndef ROCKSDB_LITE
3527 3528 3529 3530 3531 3532 3533
    if (name != "replay" && FLAGS_trace_file != "") {
      Status s = db_.db->EndTrace();
      if (!s.ok()) {
        fprintf(stderr, "Encountered an error ending the trace, %s\n",
                s.ToString().c_str());
      }
    }
3534 3535 3536 3537 3538 3539 3540 3541
    if (!FLAGS_block_cache_trace_file.empty()) {
      Status s = db_.db->EndBlockCacheTrace();
      if (!s.ok()) {
        fprintf(stderr,
                "Encountered an error ending the block cache tracing, %s\n",
                s.ToString().c_str());
      }
    }
3542
#endif  // ROCKSDB_LITE
3543

3544
    if (FLAGS_statistics) {
K
krad 已提交
3545
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
3546
    }
I
Islam AbdelRahman 已提交
3547
    if (FLAGS_simcache_size >= 0) {
3548 3549 3550
      fprintf(
          stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
          static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
3551
    }
3552 3553

#ifndef ROCKSDB_LITE
3554 3555 3556 3557
    if (FLAGS_use_secondary_db) {
      fprintf(stdout, "Secondary instance updated  %" PRIu64 " times.\n",
              secondary_db_updates_);
    }
3558
#endif  // ROCKSDB_LITE
J
jorlow@chromium.org 已提交
3559 3560
  }

3561
 private:
3562
  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3563 3564
  std::unique_ptr<port::Thread> secondary_update_thread_;
  std::atomic<int> secondary_update_stopped_{0};
3565
#ifndef ROCKSDB_LITE
3566
  uint64_t secondary_db_updates_ = 0;
3567
#endif  // ROCKSDB_LITE
3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589
  // Per-worker bundle handed to ThreadBody: the owning Benchmark, the state
  // shared by all workers, this worker's private state, and the benchmark
  // method to execute.
  struct ThreadArg {
    Benchmark* bm;
    SharedState* shared;
    ThreadState* thread;
    void (Benchmark::*method)(ThreadState*);
  };

  // Entry point for each benchmark worker thread. Registers itself with the
  // shared state, waits for the coordinator to flip `start`, runs the
  // benchmark method, then reports completion. The coordinator is
  // RunBenchmark(), which waits on the same condition variable.
  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      // Announce initialization; the last thread to arrive wakes the
      // coordinator, then everyone blocks until the start signal.
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        shared->cv.SignalAll();
      }
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
    perf_context.EnablePerLevelPerfContext();
    thread->stats.Start(thread->tid);
    // Invoke the selected Benchmark member function on this thread's state.
    (arg->bm->*(arg->method))(thread);
    thread->stats.Stop();

    {
      // Announce completion; the last finisher wakes the coordinator.
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        shared->cv.SignalAll();
      }
    }
  }

3605 3606
  // Runs one benchmark `name` with `n` worker threads all executing
  // `method`, using a barrier-style rendezvous: spawn the threads, wait for
  // all to initialize, release them simultaneously, wait for all to finish,
  // then merge and report their per-thread stats. Returns the merged Stats.
  Stats RunBenchmark(int n, Slice name,
                     void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;
    // Optional global write/read rate limiting across all workers.
    if (FLAGS_benchmark_write_rate_limit > 0) {
      shared.write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
    if (FLAGS_benchmark_read_rate_limit > 0) {
      shared.read_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
    }

    // Optional periodic throughput reporter; must outlive the workers that
    // hold a raw pointer to it (it does: reset happens after joins below).
    std::unique_ptr<ReporterAgent> reporter_agent;
    if (FLAGS_report_interval_seconds > 0) {
      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                             FLAGS_report_interval_seconds));
    }

    ThreadArg* arg = new ThreadArg[n];

    for (int i = 0; i < n; i++) {
#ifdef NUMA
      if (FLAGS_enable_numa) {
        // Performs a local allocation of memory to threads in numa node.
        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
        numa_exit_on_error = 1;
        int numa_node = i % n_nodes;
        bitmask* nodes = numa_allocate_nodemask();
        numa_bitmask_clearall(nodes);
        numa_bitmask_setbit(nodes, numa_node);
        // numa_bind() call binds the process to the node and these
        // properties are passed on to the thread that is created in
        // StartThread method called later in the loop.
        numa_bind(nodes);
        numa_set_strict(1);
        numa_free_nodemask(nodes);
      }
#endif
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    // Wait until every worker has checked in, then release them all at once
    // so they start the timed section together.
    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;

    return merge_stats;
  }

3684 3685 3686
  // Shared driver for checksum micro-benchmarks (CRC32c, xxHash variants).
  // Repeatedly applies `fn` to a --block_size buffer of 'x' bytes until
  // roughly 5GB have been processed, counting one op (of type kOpType) per
  // call. Extra checksum arguments (e.g. a seed) are forwarded via `args`.
  template <OperationType kOpType, typename FnType, typename... Args>
  static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
                                       Args... args) {
    const int size = FLAGS_block_size; // use --block_size option for db_bench
    std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
    const char* label = labels.c_str();

    std::string data(size, 'x');
    uint64_t bytes = 0;
    uint32_t val = 0;
    while (bytes < 5000U * uint64_t{1048576}) {  // ~5GB
      // Accumulate the result so the compiler cannot elide the work.
      val += static_cast<uint32_t>(fn(data.data(), size, args...));
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

3706 3707 3708 3709
  // Benchmark: CRC32c checksum throughput over --block_size blocks.
  void Crc32c(ThreadState* thread) {
    ChecksumBenchmark<kCrc>(crc32c::Value, thread);
  }

I
xxHash  
Igor Canadi 已提交
3710
  // Benchmark: 32-bit xxHash throughput over --block_size blocks.
  void xxHash(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
  }
I
xxHash  
Igor Canadi 已提交
3713

3714 3715 3716 3717 3718 3719
  // Benchmark: 64-bit xxHash throughput over --block_size blocks.
  void xxHash64(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
  }

  // Benchmark: XXH3 (64-bit) throughput over --block_size blocks.
  void xxh3(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH3_64bits, thread);
  }

3722
  // Benchmark: cost of acquire-ordered atomic pointer loads. Performs
  // 100000 ops, each consisting of 1000 loads from a local atomic cell.
  void AcquireLoad(ThreadState* thread) {
    int sentinel;
    std::atomic<void*> cell(&sentinel);
    void* observed = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    for (int op = 0; op < 100000; ++op) {
      for (int load = 0; load < 1000; ++load) {
        observed = cell.load(std::memory_order_acquire);
      }
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
    }
    // Keep `observed` live so the loads are not optimized away; the
    // condition can never hold since the cell always points at `sentinel`.
    if (observed == nullptr) exit(1);  // Disable unused variable warning.
  }

A
Albert Strasheim 已提交
3738
  // Benchmark: compress a single --block_size random block repeatedly until
  // ~1GB of input has been processed, then report the compression ratio
  // (compressed output as a percentage of input).
  void Compress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;
    CompressionOptions opts;
    CompressionContext context(FLAGS_compression_type_e);
    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                         FLAGS_compression_type_e,
                         FLAGS_sample_for_compression);
    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      compressed.clear();
      ok = CompressSlice(info, input, &compressed);
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      char buf[340];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }

A
Albert Strasheim 已提交
3770
  // Benchmark: compress one --block_size random block once, then repeatedly
  // decompress it until ~1GB of (original-sized) input has been processed.
  void Uncompress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    CompressionContext compression_ctx(FLAGS_compression_type_e);
    CompressionOptions compression_opts;
    CompressionInfo compression_info(
        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
        FLAGS_compression_type_e, FLAGS_sample_for_compression);
    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
    UncompressionInfo uncompression_info(uncompression_ctx,
                                         UncompressionDict::GetEmptyDict(),
                                         FLAGS_compression_type_e);

    // Prepare the compressed payload once; the loop below only measures
    // decompression.
    bool ok = CompressSlice(compression_info, input, &compressed);
    int64_t bytes = 0;
    size_t uncompressed_size = 0;
    while (ok && bytes < 1024 * 1048576) {
      constexpr uint32_t compress_format_version = 2;

      CacheAllocationPtr uncompressed = UncompressData(
          uncompression_info, compressed.data(), compressed.size(),
          &uncompressed_size, compress_format_version);

      // A null result signals decompression failure and stops the loop.
      ok = uncompressed.get() != nullptr;
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }

3807 3808 3809 3810 3811 3812 3813 3814
  // Returns true if the options is initialized from the specified
  // options file (--options_file). On a load failure the process exits
  // with an error; when no options file is given (or in ROCKSDB_LITE,
  // where options files are unsupported) returns false so the caller
  // falls back to flag-based initialization.
  bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
    printf("Initializing RocksDB Options from the specified file\n");
    DBOptions db_opts;
    std::vector<ColumnFamilyDescriptor> cf_descs;
    if (FLAGS_options_file != "") {
      auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
                                   &cf_descs);
      // Override the env from the file with the one configured via flags.
      db_opts.env = FLAGS_env;
      if (s.ok()) {
        // Only the first (default) column family's options are used.
        *opts = Options(db_opts, cf_descs[0].options);
        return true;
      }
      fprintf(stderr, "Unable to load options file %s --- %s\n",
              FLAGS_options_file.c_str(), s.ToString().c_str());
      exit(1);
    }
#else
    (void)opts;
#endif
    return false;
  }

  // Populates *opts entirely from command-line flags. Called only when no
  // --options_file was loaded; must run before the DB is opened (asserted
  // below). Any invalid flag combination terminates the process via exit(1).
  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;
    ConfigOptions config_options(options);
    // Fail loudly on options the build does not support instead of silently
    // ignoring them.
    config_options.ignore_unsupported_options = false;

    // Options must be fully built before OpenDb() runs.
    assert(db_.db == nullptr);

    options.env = FLAGS_env;
    options.max_open_files = FLAGS_open_files;
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      // Charge memtable memory to the block cache and/or cap total write
      // buffer memory across all column families / DBs.
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    // --- Memtable / write buffer sizing -----------------------------------
    options.arena_block_size = FLAGS_arena_block_size;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
      FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    // --- Background work --------------------------------------------------
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    // --- I/O mode ---------------------------------------------------------
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
    options.manual_wal_flush = FLAGS_manual_wal_flush;
    options.wal_compression = FLAGS_wal_compression_e;
#ifndef ROCKSDB_LITE
    // FIFO-compaction tuning is unavailable in LITE builds.
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
    options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
#endif  // ROCKSDB_LITE
    if (FLAGS_prefix_size != 0) {
      options.prefix_extractor.reset(
          NewFixedPrefixTransform(FLAGS_prefix_size));
    }
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      // The comparator decodes keys as fixed 8-byte integers.
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    // --- Memtable filtering / hinting -------------------------------------
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.log_readahead_size = FLAGS_log_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    // --- LSM shape --------------------------------------------------------
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    // --- Memtable rep factory ---------------------------------------------
    Status s =
        CreateMemTableRepFactory(config_options, &options.memtable_factory);
    if (!s.ok()) {
      fprintf(stderr, "Could not create memtable factory: %s\n",
              s.ToString().c_str());
      exit(1);
    } else if ((FLAGS_prefix_size == 0) &&
               (options.memtable_factory->IsInstanceOf("prefix_hash") ||
                options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
      // Hash-based memtable reps bucket entries by key prefix, so a prefix
      // extractor is mandatory.
      fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
                      "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    // --- Table format: plain / cuckoo / block-based -----------------------
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
          !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
        // Warning only; plain table typically pairs with a hash memtable.
        fprintf(stderr, "Warning: plain table is used with %s\n",
                options.memtable_factory->Name());
      }

      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        // Negative flag means "use the plain-table default".
        bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      // Default: block-based table.
      BlockBasedTableOptions block_based_options;
      block_based_options.checksum =
          static_cast<ChecksumType>(FLAGS_checksum_type);
      if (FLAGS_use_hash_search) {
        // Hash index buckets by prefix, so an extractor is required.
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
              "prefix_size not assigned when enable use_hash_search \n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        // Partitioned index overrides any index type selected above;
        // conflicting flags are reported but not fatal.
        if (FLAGS_index_with_first_key) {
          fprintf(stderr,
                  "--index_with_first_key is not compatible with"
                  " partition index.");
        }
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      } else if (FLAGS_index_with_first_key) {
        block_based_options.index_type =
            BlockBasedTableOptions::kBinarySearchWithFirstKey;
      }
      // Map the numeric flag to an IndexShorteningMode; an unrecognized value
      // keeps the table-options default (only a message is printed).
      BlockBasedTableOptions::IndexShorteningMode index_shortening =
          block_based_options.index_shortening;
      switch (FLAGS_index_shortening_mode) {
        case 0:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
          break;
        case 1:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
          break;
        case 2:
          index_shortening = BlockBasedTableOptions::IndexShorteningMode::
              kShortenSeparatorsAndSuccessor;
          break;
        default:
          fprintf(stderr, "Unknown key shortening mode\n");
      }
      block_based_options.optimize_filters_for_memory =
          FLAGS_optimize_filters_for_memory;
      block_based_options.index_shortening = index_shortening;
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      block_based_options.block_cache = cache_;
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
      // Same pattern as index_shortening: unknown flag values keep the
      // default prepopulate mode.
      BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
          block_based_options.prepopulate_block_cache;
      switch (FLAGS_prepopulate_block_cache) {
        case 0:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
          break;
        case 1:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
          break;
        default:
          fprintf(stderr, "Unknown prepopulate block cache mode\n");
      }
      block_based_options.prepopulate_block_cache = prepopulate_block_cache;
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
      if (FLAGS_read_cache_path != "") {
#ifndef ROCKSDB_LITE
        Status rc_status;

        // The read cache needs a Logger; all read-cache logs go into a file
        // named rc_LOG inside the read cache path.
        rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
        std::shared_ptr<Logger> read_cache_logger;
        if (rc_status.ok()) {
          rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
                                           &read_cache_logger);
        }

        if (rc_status.ok()) {
          PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
                                       FLAGS_read_cache_size,
                                       read_cache_logger);

          rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
          rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
          rc_cfg.writer_qdepth = 4;
          rc_cfg.writer_dispatch_size = 4 * 1024;

          auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
          block_based_options.persistent_cache = pcache;
          rc_status = pcache->Open();
        }

        if (!rc_status.ok()) {
          fprintf(stderr, "Error initializing read cache, %s\n",
                  rc_status.ToString().c_str());
          exit(1);
        }
#else
        fprintf(stderr, "Read cache is not supported in LITE\n");
        exit(1);

#endif
      }
      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    // --- Per-level fanout / compression -----------------------------------
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      // When given, the per-level fanout list must cover every level.
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
        FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
      FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    if (FLAGS_simulate_hybrid_fs_file != "") {
      options.bottommost_temperature = Temperature::kWarm;
    }
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;

    if (FLAGS_min_level_to_compress >= 0) {
      // Levels below the threshold stay uncompressed; the rest use the
      // configured compression type.
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress;
           i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    // --- Write stall / concurrency knobs ----------------------------------
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.experimental_mempurge_threshold =
        FLAGS_experimental_mempurge_threshold;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
    options.paranoid_checks = FLAGS_paranoid_checks;
    options.force_consistency_checks = FLAGS_force_consistency_checks;
    options.check_flush_compaction_key_order =
        FLAGS_check_flush_compaction_key_order;
    options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
    options.ttl = FLAGS_ttl_seconds;
    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;

    // merge operator options
    if (!FLAGS_merge_operator.empty()) {
      s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
                                          &options.merge_operator);
      if (!s.ok()) {
        fprintf(stderr, "invalid merge operator[%s]: %s\n",
                FLAGS_merge_operator.c_str(), s.ToString().c_str());
        exit(1);
      }
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;

    // set universal style compaction configurations, if applicable
    // (zero / -1 sentinels mean "keep the library default").
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
        FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
        FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
        FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
        FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
        FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    options.compaction_options_universal.incremental =
        FLAGS_universal_incremental;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }

    if (FLAGS_user_timestamp_size > 0) {
      // Only 8-byte (uint64) timestamps are supported by the test comparator.
      if (FLAGS_user_timestamp_size != 8) {
        fprintf(stderr, "Only 64 bits timestamps are supported.\n");
        exit(1);
      }
      options.comparator = test::BytewiseComparatorWithU64TsWrapper();
    }

    // Integrated BlobDB
    options.enable_blob_files = FLAGS_enable_blob_files;
    options.min_blob_size = FLAGS_min_blob_size;
    options.blob_file_size = FLAGS_blob_file_size;
    options.blob_compression_type =
        StringToCompressionType(FLAGS_blob_compression_type.c_str());
    options.enable_blob_garbage_collection =
        FLAGS_enable_blob_garbage_collection;
    options.blob_garbage_collection_age_cutoff =
        FLAGS_blob_garbage_collection_age_cutoff;
    options.blob_garbage_collection_force_threshold =
        FLAGS_blob_garbage_collection_force_threshold;
    options.blob_compaction_readahead_size =
        FLAGS_blob_compaction_readahead_size;

#ifndef ROCKSDB_LITE
    // Flag combinations that cannot be honored at open time.
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE

  }

  // Applies settings common to both initialization paths (options file or
  // flags), then opens the database(s). Runs after InitializeOptionsFromFile
  // or InitializeOptionsFromFlags, so the assignments here intentionally
  // override whatever those produced.
  void InitializeOptionsGeneral(Options* opts) {
    Options& options = *opts;

    options.create_missing_column_families = FLAGS_num_column_families > 1;
    options.statistics = dbstats;
    options.wal_dir = FLAGS_wal_dir;
    options.create_if_missing = !FLAGS_use_existing_db;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);

    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    options.compression_opts.parallel_threads =
        FLAGS_compression_parallel_threads;
    options.compression_opts.max_dict_buffer_bytes =
        FLAGS_compression_max_dict_buffer_bytes;
    // If this is a block based table, set some related options
    // (GetOptions returns nullptr for plain/cuckoo table factories).
    auto table_options =
        options.table_factory->GetOptions<BlockBasedTableOptions>();
    if (table_options != nullptr) {
      if (FLAGS_cache_size) {
        table_options->block_cache = cache_;
      }
      // bloom_bits semantics: <0 keep default policy, ==0 disable the
      // filter, >0 build a Ribbon/Bloom/block-based filter with that many
      // bits per key.
      if (FLAGS_bloom_bits < 0) {
        table_options->filter_policy = BlockBasedTableOptions().filter_policy;
      } else if (FLAGS_bloom_bits == 0) {
        table_options->filter_policy.reset();
      } else if (FLAGS_use_block_based_filter) {
        // Use back-door way of enabling obsolete block-based Bloom
        Status s = FilterPolicy::CreateFromString(
            ConfigOptions(),
            "rocksdb.internal.DeprecatedBlockBasedBloomFilter:" +
                ROCKSDB_NAMESPACE::ToString(FLAGS_bloom_bits),
            &table_options->filter_policy);
        if (!s.ok()) {
          fprintf(stderr, "failure creating obsolete block-based filter: %s\n",
                  s.ToString().c_str());
          exit(1);
        }
      } else {
        table_options->filter_policy.reset(
            FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
                                    : NewBloomFilterPolicy(FLAGS_bloom_bits));
      }
    }
    if (FLAGS_row_cache_size) {
      if (FLAGS_cache_numshardbits >= 1) {
        options.row_cache =
            NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
      } else {
        options.row_cache = NewLRUCache(FLAGS_row_cache_size);
      }
    }
    // Thread-pool priority changes apply process-wide via the env, not to
    // this Options object.
    if (FLAGS_enable_io_prio) {
      FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
    }
    // Re-assert the env here so it also applies when the options came from a
    // file rather than from flags.
    options.env = FLAGS_env;
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
      options.rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_rate_limiter_bytes_per_sec, FLAGS_rate_limiter_refill_period_us,
          10 /* fairness */,
          FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                    : RateLimiter::Mode::kWritesOnly,
          FLAGS_rate_limiter_auto_tuned));
    }

    options.listeners.emplace_back(listener_);

    if (FLAGS_file_checksum) {
      options.file_checksum_gen_factory.reset(
          new FileChecksumGenCrc32cFactory());
    }

    // Open either the single DB or FLAGS_num_multi_db separate DBs, each
    // multi-DB instance getting its own db/wal subpath.
    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      // Restore the shared wal_dir after per-DB overrides.
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter, this can be used to test compaction filter
    // NOTE(review): this assignment happens after OpenDb() above, so it would
    // only affect DBs opened later (e.g. a benchmark reopen) — confirm that
    // ordering is intended. The raw pointer is never deleted here.
    if (FLAGS_use_keep_filter) {
      options.compaction_filter = new KeepFilter();
      fprintf(stdout, "A noop compaction filter is used\n");
    }

    if (FLAGS_use_existing_keys) {
      // Only work on single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;  // before read_options_ initialized
      read_opts.total_order_seek = true;
      // Preload every existing key so benchmarks can draw from real keys;
      // FLAGS_num is clamped to the actual key count.
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }

  // Build the final Options and open the database(s): prefer an options file
  // when one was supplied, otherwise derive everything from command-line
  // flags, then apply the settings common to both paths.
  void Open(Options* opts) {
    const bool loaded_from_file = InitializeOptionsFromFile(opts);
    if (!loaded_from_file) {
      InitializeOptionsFromFlags(opts);
    }

    InitializeOptionsGeneral(opts);
  }

Y
Yi Wu 已提交
4399
  void OpenDb(Options options, const std::string& db_name,
4400
      DBWithColumnFamilies* db) {
4401
    uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
H
heyongqiang 已提交
4402
    Status s;
4403 4404
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
4405 4406 4407 4408 4409 4410 4411
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
4412
      std::vector<ColumnFamilyDescriptor> column_families;
4413
      for (size_t i = 0; i < num_hot; i++) {
4414 4415 4416
        column_families.push_back(ColumnFamilyDescriptor(
              ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
4439
#ifndef ROCKSDB_LITE
4440 4441 4442
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families,
            &db->cfh, &db->db);
A
agiardullo 已提交
4443
      } else if (FLAGS_optimistic_transaction_db) {
A
agiardullo 已提交
4444
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
A
agiardullo 已提交
4445 4446 4447 4448 4449 4450 4451
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
4452 4453 4454 4455 4456
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
A
agiardullo 已提交
4457 4458
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
A
agiardullo 已提交
4459
        if (s.ok()) {
A
agiardullo 已提交
4460
          db->db = ptr;
A
agiardullo 已提交
4461
        }
4462 4463 4464
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
4465 4466 4467
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
4468 4469 4470
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
4471
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
4472
#ifndef ROCKSDB_LITE
4473 4474
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
A
agiardullo 已提交
4475 4476 4477 4478 4479
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
A
agiardullo 已提交
4480
    } else if (FLAGS_transaction_db) {
4481
      TransactionDB* ptr = nullptr;
A
agiardullo 已提交
4482
      TransactionDBOptions txn_db_options;
4483 4484 4485 4486 4487
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
Y
Yi Wu 已提交
4488 4489 4490 4491
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
A
agiardullo 已提交
4492
      if (s.ok()) {
A
agiardullo 已提交
4493
        db->db = ptr;
A
agiardullo 已提交
4494
      }
4495
    } else if (FLAGS_use_blob_db) {
4496
      // Stacked BlobDB
A
Anirban Rahut 已提交
4497
      blob_db::BlobDBOptions blob_db_options;
4498
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
4499
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
S
Sagar Vemuri 已提交
4500
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
Y
Yi Wu 已提交
4501
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
S
Sagar Vemuri 已提交
4502 4503 4504 4505
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
4506
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
4507
      blob_db::BlobDB* ptr = nullptr;
Y
Yi Wu 已提交
4508
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
A
Anirban Rahut 已提交
4509 4510 4511
      if (s.ok()) {
        db->db = ptr;
      }
4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
A
Anirban Rahut 已提交
4538
#endif  // ROCKSDB_LITE
H
heyongqiang 已提交
4539
    } else {
4540
      s = DB::Open(options, db_name, &db->db);
H
heyongqiang 已提交
4541
    }
4542 4543 4544 4545 4546
    if (FLAGS_report_open_timing) {
      std::cout << "OpenDb:     "
                << (FLAGS_env->NowNanos() - open_start) / 1000000.0
                << " milliseconds\n";
    }
4547 4548 4549 4550 4551 4552
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }

4553 4554 4555 4556
  // Key-ordering modes used by the write benchmarks.
  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };

4557 4558 4559 4560 4561 4562 4563 4564 4565
  // Sequentially load the DB, then run the deterministic manual compaction.
  void WriteSeqDeterministic(ThreadState* thread) {
    const auto style = open_options_.compaction_style;
    DoDeterministicCompact(thread, style, SEQUENTIAL);
  }

  // Load every key once in shuffled order, then run the deterministic
  // manual compaction.
  void WriteUniqueRandomDeterministic(ThreadState* thread) {
    const auto style = open_options_.compaction_style;
    DoDeterministicCompact(thread, style, UNIQUE_RANDOM);
  }

4566
  // Benchmark entry point: write keys in ascending order.
  void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }
4569

4570
  // Benchmark entry point: write keys drawn uniformly at random
  // (with replacement).
  void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }

4574 4575 4576 4577
  // Benchmark entry point: write each key exactly once, in random order.
  void WriteUniqueRandom(ThreadState* thread) { DoWrite(thread, UNIQUE_RANDOM); }

4578 4579
  class KeyGenerator {
   public:
A
Andrew Kryczka 已提交
4580 4581 4582
    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
                 uint64_t /*num_per_set*/ = 64 * 1024)
        : rand_(rand), mode_(mode), num_(num), next_(0) {
4583 4584 4585 4586 4587 4588 4589 4590 4591
      if (mode_ == UNIQUE_RANDOM) {
        // NOTE: if memory consumption of this approach becomes a concern,
        // we can either break it into pieces and only random shuffle a section
        // each time. Alternatively, use a bit map implementation
        // (https://reviews.facebook.net/differential/diff/54627/)
        values_.resize(num_);
        for (uint64_t i = 0; i < num_; ++i) {
          values_[i] = i;
        }
P
Peter Dillinger 已提交
4592 4593
        RandomShuffle(values_.begin(), values_.end(),
                      static_cast<uint32_t>(FLAGS_seed));
4594 4595 4596 4597 4598 4599 4600 4601 4602 4603
      }
    }

    uint64_t Next() {
      switch (mode_) {
        case SEQUENTIAL:
          return next_++;
        case RANDOM:
          return rand_->Next() % num_;
        case UNIQUE_RANDOM:
4604
          assert(next_ < num_);
4605 4606 4607 4608 4609 4610
          return values_[next_++];
      }
      assert(false);
      return std::numeric_limits<uint64_t>::max();
    }

4611 4612 4613 4614 4615 4616 4617
    // Only available for UNIQUE_RANDOM mode.
    uint64_t Fetch(uint64_t index) {
      assert(mode_ == UNIQUE_RANDOM);
      assert(index < values_.size());
      return values_[index];
    }

4618 4619 4620 4621 4622 4623 4624 4625
   private:
    Random64* rand_;
    WriteMode mode_;
    const uint64_t num_;
    uint64_t next_;
    std::vector<uint64_t> values_;
  };

4626
  // Convenience wrapper: pick a DB for this operation, ignoring the
  // column-family bookkeeping of the selected bundle.
  DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }

  // Picks a DB+column-family bundle using the thread's RNG stream.
  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
    const uint64_t draw = thread->rand.Next();
    return SelectDBWithCfh(draw);
  }

  // Single-DB runs always use db_; multi-DB runs spread the random value
  // over the multi_dbs_ set.
  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
    if (db_.db == nullptr) {
      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
    return &db_;
  }
4641

4642 4643 4644 4645
  // Target write rate following a sine wave: a * sin(b*x + c) + d,
  // with the coefficients taken from the --sine_* flags.
  double SineRate(double x) {
    const double angle = (FLAGS_sine_b * x) + FLAGS_sine_c;
    return FLAGS_sine_a * sin(angle) + FLAGS_sine_d;
  }

4646 4647
  // Core write loop shared by fillseq/fillrandom/filluniquerandom and
  // related benchmarks.  Repeatedly builds a WriteBatch of
  // entries_per_batch_ entries -- optionally mixing in overwrites from a
  // sliding key window, DeleteRange tombstones, and the
  // disposable/persistent entry simulation -- and commits it until the
  // duration expires or all keys have been written.
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    // Only RANDOM workloads are time-bounded; the others run to key count.
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
4648
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;
4649

4650
    // One key generator per DB when running against multiple databases.
    size_t num_key_gens = 1;
4651
    if (db_.db == nullptr) {
4652 4653 4654
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
4655 4656 4657 4658 4659 4660 4661 4662 4663
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    // With rotating hot column families the run is split into stages; a
    // new set of CFs is created at each stage boundary inside the loop.
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }

    Duration duration(test_duration, max_ops, ops_per_stage);
4664
    // The generators must also cover the keys consumed when choosing
    // range-tombstone begin keys (key_gens[id]->Next() further below).
    const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
4665
    for (size_t i = 0; i < num_key_gens; i++) {
4666
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
4667
                                         num_per_key_gen, ops_per_stage));
4668
    }
M
Mark Callaghan 已提交
4669

4670
    if (num_ != FLAGS_num) {
4671
      char msg[100];
4672
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
4673
      thread->stats.AddMessage(msg);
4674 4675
    }

4676
    RandomGenerator gen;
4677
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
4678
                     /*protection_bytes_per_key=*/0, user_timestamp_size_);
J
jorlow@chromium.org 已提交
4679
    Status s;
4680
    int64_t bytes = 0;
L
Lei Jin 已提交
4681

4682 4683
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
A
Andrew Kryczka 已提交
4684 4685 4686 4687
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
4688
    double p = 0.0;
4689
    uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717
    // If user set overwrite_probability flag,
    // check if value is in [0.0,1.0].
    if (FLAGS_overwrite_probability > 0.0) {
      p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
      // If overwrite set by user, and UNIQUE_RANDOM mode on,
      // the overwrite_window_size must be > 0.
      if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
        fprintf(stderr,
                "Overwrite_window_size must be  strictly greater than 0.\n");
        ErrorExit();
      }
    }

    // Default_random_engine provides slightly
    // improved throughput over mt19937.
    std::default_random_engine overwrite_gen{
        static_cast<unsigned int>(FLAGS_seed)};
    std::bernoulli_distribution overwrite_decider(p);

    // Inserted key window is filled with the last N
    // keys previously inserted into the DB (with
    // N=FLAGS_overwrite_window_size).
    // We use a deque struct because:
    // - random access is O(1)
    // - insertion/removal at beginning/end is also O(1).
    std::deque<int64_t> inserted_key_window;
    Random64 reservoir_id_gen(FLAGS_seed);

4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760
    // --- Variables used in disposable/persistent keys simulation:
    // The following variables are used when
    // disposable_entries_batch_size is >0. We simualte a workload
    // where the following sequence is repeated multiple times:
    // "A set of keys S1 is inserted ('disposable entries'), then after
    // some delay another set of keys S2 is inserted ('persistent entries')
    // and the first set of keys S1 is deleted. S2 artificially represents
    // the insertion of hypothetical results from some undefined computation
    // done on the first set of keys S1. The next sequence can start as soon
    // as the last disposable entry in the set S1 of this sequence is
    // inserted, if the delay is non negligible"
    bool skip_for_loop = false, is_disposable_entry = true;
    std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
    std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
    const uint64_t kNumDispAndPersEntries =
        FLAGS_disposable_entries_batch_size +
        FLAGS_persistent_entries_batch_size;
    if (kNumDispAndPersEntries > 0) {
      if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
          (p > 0.0)) {
        fprintf(
            stderr,
            "Disposable/persistent deletes are not compatible with overwrites "
            "and DeleteRanges; and are only supported in filluniquerandom.\n");
        ErrorExit();
      }
      if (FLAGS_disposable_entries_value_size < 0 ||
          FLAGS_persistent_entries_value_size < 0) {
        fprintf(
            stderr,
            "disposable_entries_value_size and persistent_entries_value_size"
            "have to be positive.\n");
        ErrorExit();
      }
    }
    Random rnd_disposable_entry(static_cast<uint32_t>(FLAGS_seed));
    std::string random_value;
    // Queue that stores scheduled timestamp of disposable entries deletes,
    // along with starting index of disposable entry keys to delete.
    std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
        num_key_gens);
    // --- End of variables used in disposable/persistent keys simulation.

4761 4762 4763 4764 4765 4766 4767 4768
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }
A
Andrew Kryczka 已提交
4769

4770 4771 4772 4773 4774
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

4775
    int64_t stage = 0;
A
Andrew Kryczka 已提交
4776
    int64_t num_written = 0;
4777 4778 4779
    int64_t next_seq_db_at = num_ops;
    size_t id = 0;

4780
    // Main loop: each iteration builds and commits one WriteBatch.
    while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
4781 4782 4783 4784 4785 4786 4787 4788 4789 4790
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }
4791

4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808
      if (write_mode != SEQUENTIAL) {
        id = thread->rand.Next() % num_key_gens;
      } else {
        // When doing a sequential load with multiple databases, load them in
        // order rather than all at the same time to avoid:
        // 1) long delays between flushing memtables
        // 2) flushing memtables for all of them at the same point in time
        // 3) not putting the same number of keys in each database
        if (num_written >= next_seq_db_at) {
          next_seq_db_at += num_ops;
          id++;
          if (id >= num_key_gens) {
            fprintf(stderr, "Logic error. Filled all databases\n");
            ErrorExit();
          }
        }
      }
4809
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
4810

J
jorlow@chromium.org 已提交
4811
      batch.Clear();
4812
      int64_t batch_bytes = 0;
4813

L
Lei Jin 已提交
4814
      for (int64_t j = 0; j < entries_per_batch_; j++) {
4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831
        int64_t rand_num = 0;
        if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
          if ((inserted_key_window.size() > 0) &&
              overwrite_decider(overwrite_gen)) {
            num_overwrites++;
            rand_num = inserted_key_window[reservoir_id_gen.Next() %
                                           inserted_key_window.size()];
          } else {
            num_unique_keys++;
            rand_num = key_gens[id]->Next();
            if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
              inserted_key_window.push_back(rand_num);
            } else {
              inserted_key_window.pop_front();
              inserted_key_window.push_back(rand_num);
            }
          }
4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912
        } else if (kNumDispAndPersEntries > 0) {
          // Check if queue is non-empty and if we need to insert
          // 'persistent' KV entries (KV entries that are never deleted)
          // and delete disposable entries previously inserted.
          if (!disposable_entries_q[id].empty() &&
              (disposable_entries_q[id].front().first <
               FLAGS_env->NowMicros())) {
            // If we need to perform a "merge op" pattern,
            // we first write all the persistent KV entries not targeted
            // by deletes, and then we write the disposable entries deletes.
            if (persistent_ent_and_del_index[id] <
                FLAGS_persistent_entries_batch_size) {
              // Generate key to insert.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      FLAGS_disposable_entries_batch_size +
                                      persistent_ent_and_del_index[id]);
              persistent_ent_and_del_index[id]++;
              is_disposable_entry = false;
              skip_for_loop = false;
            } else if (persistent_ent_and_del_index[id] <
                       kNumDispAndPersEntries) {
              // Find key of the entry to delete.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      (persistent_ent_and_del_index[id] -
                                       FLAGS_persistent_entries_batch_size));
              persistent_ent_and_del_index[id]++;
              GenerateKeyFromInt(rand_num, FLAGS_num, &key);
              // For the delete operation, everything happens here and we
              // skip the rest of the for-loop, which is designed for
              // inserts.
              if (FLAGS_num_column_families <= 1) {
                batch.Delete(key);
              } else {
                // We use same rand_num as seed for key and column family so
                // that we can deterministically find the cfh corresponding to a
                // particular key while reading the key.
                batch.Delete(db_with_cfh->GetCfh(rand_num), key);
              }
              // A delete only includes Key+Timestamp (no value).
              batch_bytes += key_size_ + user_timestamp_size_;
              bytes += key_size_ + user_timestamp_size_;
              num_selective_deletes++;
              // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
              skip_for_loop = true;
            } else {
              assert(false);  // should never reach this point.
            }
            // If disposable_entries_q needs to be updated (ie: when a selective
            // insert+delete was successfully completed, pop the job out of the
            // queue).
            if (!disposable_entries_q[id].empty() &&
                (disposable_entries_q[id].front().first <
                 FLAGS_env->NowMicros()) &&
                persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
              disposable_entries_q[id].pop();
              persistent_ent_and_del_index[id] = 0;
            }

            // If we are deleting disposable entries, skip the rest of the
            // for-loop since there is no key-value inserts at this moment in
            // time.
            if (skip_for_loop) {
              continue;
            }

          }
          // If no job is in the queue, then we keep inserting disposable KV
          // entries that will be deleted later by a series of deletes.
          else {
            rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
            disposable_entries_index[id]++;
            is_disposable_entry = true;
            if ((disposable_entries_index[id] %
                 FLAGS_disposable_entries_batch_size) == 0) {
              // Skip the persistent KV entries inserts for now
              disposable_entries_index[id] +=
                  FLAGS_persistent_entries_batch_size;
            }
          }
4913 4914 4915
        } else {
          rand_num = key_gens[id]->Next();
        }
4916
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
4917 4918 4919 4920 4921 4922 4923 4924 4925 4926
        Slice val;
        if (kNumDispAndPersEntries > 0) {
          random_value = rnd_disposable_entry.RandomString(
              is_disposable_entry ? FLAGS_disposable_entries_value_size
                                  : FLAGS_persistent_entries_value_size);
          val = Slice(random_value);
          num_unique_keys++;
        } else {
          val = gen.Generate();
        }
Y
Yi Wu 已提交
4927 4928
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
4929
          // Stacked BlobDB
A
Anirban Rahut 已提交
4930 4931
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
4932 4933 4934 4935 4936 4937
          if (FLAGS_blob_db_max_ttl_range > 0) {
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
Y
Yi Wu 已提交
4938
#endif  //  ROCKSDB_LITE
4939
        } else if (FLAGS_num_column_families <= 1) {
4940
          batch.Put(key, val);
4941 4942 4943 4944
        } else {
          // We use same rand_num as seed for key and column family so that we
          // can deterministically find the cfh corresponding to a particular
          // key while reading the key.
4945
          batch.Put(db_with_cfh->GetCfh(rand_num), key,
4946
                    val);
4947
        }
4948 4949
        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
        bytes += val.size() + key_size_ + user_timestamp_size_;
A
Andrew Kryczka 已提交
4950
        ++num_written;
4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967

        // If all disposable entries have been inserted, then we need to
        // add in the job queue a call for 'persistent entry insertions +
        // disposable entry deletions'.
        if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
            ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
          // Queue contains [timestamp, starting_idx],
          // timestamp = current_time + delay (minimum aboslute time when to
          // start inserting the selective deletes) starting_idx = index in the
          // keygen of the rand_num to generate the key of the first KV entry to
          // delete (= key of the first selective delete).
          disposable_entries_q[id].push(std::make_pair(
              FLAGS_env->NowMicros() +
                  FLAGS_disposable_entries_delete_delay /* timestamp */,
              disposable_entries_index[id] - kNumDispAndPersEntries
              /*starting idx*/));
        }
A
Andrew Kryczka 已提交
4968
        if (writes_per_range_tombstone_ > 0 &&
4969 4970 4971
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
A
Andrew Kryczka 已提交
4972
                max_num_range_tombstones_ &&
4973 4974 4975
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
A
Andrew Kryczka 已提交
4976
          int64_t begin_num = key_gens[id]->Next();
4977 4978 4979 4980 4981
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
Y
Yi Wu 已提交
4982 4983
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
4984
                // Stacked BlobDB
4985 4986
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
Y
Yi Wu 已提交
4987
#endif  //  ROCKSDB_LITE
4988 4989 4990 4991 4992 4993 4994
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
A
Andrew Kryczka 已提交
4995
          } else {
4996 4997 4998
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
Y
Yi Wu 已提交
4999 5000
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
5001
              // Stacked BlobDB
5002 5003 5004
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
Y
Yi Wu 已提交
5005
#endif  //  ROCKSDB_LITE
5006 5007 5008 5009 5010 5011
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
A
Andrew Kryczka 已提交
5012 5013
          }
        }
5014
      }
5015 5016 5017 5018 5019 5020 5021 5022 5023
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
5024 5025
      if (user_timestamp_size_ > 0) {
        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
5026 5027
        s = batch.UpdateTimestamps(
            user_ts, [this](uint32_t) { return user_timestamp_size_; });
5028 5029 5030 5031 5032 5033
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp to write batch: %s\n",
                  s.ToString().c_str());
          ErrorExit();
        }
      }
Y
Yi Wu 已提交
5034
      if (!use_blob_db_) {
5035
        // Not stacked BlobDB
5036 5037
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
5038
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
5039
                                entries_per_batch_, kWrite);
5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060
      // Periodically recompute the rate limit from the sine wave when
      // --sine_write_rate is set.
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
                  static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
                  static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
                  NewGenericRateLimiter(write_rate));
        }
      }
5061 5062 5063 5064
      // On failure, give the error-recovery listener a chance to clear
      // the error before reporting it.
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }

J
jorlow@chromium.org 已提交
5065 5066
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
5067
        ErrorExit();
J
jorlow@chromium.org 已提交
5068 5069
      }
    }
5070 5071
    if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
      fprintf(stdout,
5072
              "Number of unique keys inserted: %" PRIu64
5073 5074
              ".\nNumber of overwrites: %" PRIu64 "\n",
              num_unique_keys, num_overwrites);
5075 5076 5077 5078 5079
    } else if (kNumDispAndPersEntries > 0) {
      fprintf(stdout,
              "Number of unique keys inserted (disposable+persistent): %" PRIu64
              ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
              num_written, num_selective_deletes);
5080
    }
5081
    thread->stats.AddBytes(bytes);
J
jorlow@chromium.org 已提交
5082 5083
  }

5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099
  // Deterministically fills the DB(s) so that each level (level style) or
  // sorted run (universal style) ends up with a predictable set of files,
  // then manually compacts them into place with CompactFiles()/CompactRange().
  // Used by the "filldeterministic" benchmark.
  //
  // Auto compactions are disabled (and the L0 slowdown/stop triggers raised)
  // for the duration of the run; the original option values are restored
  // before returning. Returns InvalidArgument for unsupported configurations
  // (FIFO with num_levels != 1 or with multiple DBs, or no compaction style).
  Status DoDeterministicCompact(ThreadState* thread,
                                CompactionStyle compaction_style,
                                WriteMode write_mode) {
#ifndef ROCKSDB_LITE
    ColumnFamilyMetaData meta;
    std::vector<DB*> db_list;
    if (db_.db != nullptr) {
      db_list.push_back(db_.db);
    } else {
      for (auto& db : multi_dbs_) {
        db_list.push_back(db.db);
      }
    }
    std::vector<Options> options_list;
    for (auto db : db_list) {
      // Remember original options so they can be restored at the end.
      options_list.push_back(db->GetOptions());
      if (compaction_style != kCompactionStyleFIFO) {
        db->SetOptions({{"disable_auto_compactions", "1"},
                        {"level0_slowdown_writes_trigger", "400000000"},
                        {"level0_stop_writes_trigger", "400000000"}});
      } else {
        db->SetOptions({{"disable_auto_compactions", "1"}});
      }
    }

    assert(!db_list.empty());
    auto num_db = db_list.size();
    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
    size_t output_level = open_options_.num_levels - 1;
    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
    std::vector<size_t> num_files_at_level0(num_db, 0);
    if (compaction_style == kCompactionStyleLevel) {
      if (num_levels == 0) {
        return Status::InvalidArgument("num_levels should be larger than 1");
      }
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          // Subsequent passes must not repeat keys already written.
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          // The newly flushed files (beyond the previously counted ones)
          // form the next sorted run.
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          if (sorted_runs[i].size() == output_level) {
            // Trim L1 so it is smaller than L2 by roughly the level
            // size multiplier.
            auto& L1 = sorted_runs[i].back();
            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
            should_stop = true;
            continue;
          }
        }
        // Each successive level receives a multiplier-fraction of the data.
        writes_ /=
            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels - 1) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          // Debug output: size of the run about to be compacted.
          std::cout << sorted_runs[i][j].size() << std::endl;
          db->CompactFiles(compactionOptions, {sorted_runs[i][j].back().name,
                                               sorted_runs[i][j].front().name},
                           static_cast<int>(output_level - j) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleUniversal) {
      auto ratio = open_options_.compaction_options_universal.size_ratio;
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
        }
        // Shrink each successive run per the universal size ratio.
        writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
                                       (ratio + 200));
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              (output_level > j ? static_cast<int>(output_level - j)
                                : 0) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleFIFO) {
      if (num_levels != 1) {
        return Status::InvalidArgument(
          "num_levels should be 1 for FIFO compaction");
      }
      if (FLAGS_num_multi_db != 0) {
        return Status::InvalidArgument("Doesn't support multiDB");
      }
      auto db = db_list[0];
      std::vector<std::string> file_names;
      // Keep writing until L0 exceeds the FIFO size budget.
      while (true) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        if (total_size >=
          db->GetOptions().compaction_options_fifo.max_table_files_size) {
          for (auto file_meta : meta.levels[0].files) {
            file_names.emplace_back(file_meta.name);
          }
          break;
        }
      }
      // TODO(shuzhang1989): Investigate why CompactFiles not working
      // auto compactionOptions = CompactionOptions();
      // db->CompactFiles(compactionOptions, file_names, 0);
      auto compactionOptions = CompactRangeOptions();
      db->CompactRange(compactionOptions, nullptr, nullptr);
    } else {
      fprintf(stdout,
              "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
              "filldeterministic");
      return Status::InvalidArgument("None compaction is not supported");
    }

// Verify seqno and key range
// Note: the seqno get changed at the max level by implementation
// optimization, so skip the check of the max level.
#ifndef NDEBUG
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      db->GetColumnFamilyMetaData(&meta);
      // verify the number of sorted runs
      if (compaction_style == kCompactionStyleLevel) {
        assert(num_levels - 1 == sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleUniversal) {
        assert(meta.levels[0].files.size() + num_levels - 1 ==
               sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleFIFO) {
        // TODO(gzh): FIFO compaction
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        assert(total_size <=
          db->GetOptions().compaction_options_fifo.max_table_files_size);
        // FIFO supports a single DB only; nothing more to verify.
        break;
      }

      // verify smallest/largest seqno and key range of each sorted run
      auto max_level = num_levels - 1;
      int level;
      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
        level = static_cast<int>(max_level - i);
        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber sorted_run_largest_seqno = 0;
        std::string sorted_run_smallest_key, sorted_run_largest_key;
        bool first_key = true;
        for (auto fileMeta : sorted_runs[k][i]) {
          sorted_run_smallest_seqno =
              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
          sorted_run_largest_seqno =
              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
            sorted_run_smallest_key = fileMeta.smallestkey;
          }
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
            sorted_run_largest_key = fileMeta.largestkey;
          }
          first_key = false;
        }
        if (compaction_style == kCompactionStyleLevel ||
            (compaction_style == kCompactionStyleUniversal && level > 0)) {
          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
          SequenceNumber level_largest_seqno = 0;
          for (auto fileMeta : meta.levels[level].files) {
            level_smallest_seqno =
                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
            level_largest_seqno =
                std::max(level_largest_seqno, fileMeta.largest_seqno);
          }
          assert(sorted_run_smallest_key ==
                 meta.levels[level].files.front().smallestkey);
          assert(sorted_run_largest_key ==
                 meta.levels[level].files.back().largestkey);
          if (level != static_cast<int>(max_level)) {
            // compaction at max_level would change sequence number
            assert(sorted_run_smallest_seqno == level_smallest_seqno);
            assert(sorted_run_largest_seqno == level_largest_seqno);
          }
        } else if (compaction_style == kCompactionStyleUniversal) {
          // level <= 0 means sorted runs on level 0
          auto level0_file =
              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
          assert(sorted_run_smallest_key == level0_file.smallestkey);
          assert(sorted_run_largest_key == level0_file.largestkey);
          if (level != static_cast<int>(max_level)) {
            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
          }
        }
      }
    }
#endif
    // print the size of each sorted_run
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      fprintf(stdout,
              "---------------------- DB %" ROCKSDB_PRIszt
              " LSM ---------------------\n",
              k);
      db->GetColumnFamilyMetaData(&meta);
      for (auto& levelMeta : meta.levels) {
        if (levelMeta.files.empty()) {
          continue;
        }
        if (levelMeta.level == 0) {
          for (auto& fileMeta : levelMeta.files) {
            fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
          }
        } else {
          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
                  levelMeta.level, levelMeta.files.front().name.c_str(),
                  levelMeta.files.back().name.c_str(), levelMeta.size);
        }
      }
    }
    // Restore the options that were overridden at the top.
    for (size_t i = 0; i < num_db; i++) {
      db_list[i]->SetOptions(
          {{"disable_auto_compactions",
            std::to_string(options_list[i].disable_auto_compactions)},
           {"level0_slowdown_writes_trigger",
            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
           {"level0_stop_writes_trigger",
            std::to_string(options_list[i].level0_stop_writes_trigger)}});
    }
    return Status::OK();
#else
    (void)thread;
    (void)compaction_style;
    (void)write_mode;
    fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
    return Status::NotSupported(
        "Rocksdb Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
  }

  // Entry point for the sequential-scan benchmark: scans the single DB, or
  // every DB in turn when running with multiple databases.
  void ReadSequential(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadSequential(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }

  // Iterates `db` forward from the first key, reading up to reads_ entries,
  // accumulating key+value bytes and throttling through the shared read rate
  // limiter (charged in 1024-op chunks). Appends PERF_CONTEXT stats to the
  // thread's messages when perf_level is enabled.
  void ReadSequential(ThreadState* thread, DB* db) {
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    options.adaptive_readahead = FLAGS_adaptive_readahead;
    options.async_io = FLAGS_async_io;

    Iterator* iter = db->NewIterator(options);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }

    delete iter;
    thread->stats.AddBytes(bytes);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  // Point-reads every key in [0, FLAGS_num) exactly once, in key order, so
  // that the row cache gets populated. Reports found/read counts and byte
  // volume; aborts on any unexpected Get() error.
  void ReadToRowCache(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int64_t key_rand = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;

    while (key_rand < FLAGS_num) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as it
      // is done in DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      key_rand++;
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                 key, &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(read_options_,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      // Charge the rate limiter in 256-op chunks to reduce overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  // Entry point for the reverse-scan benchmark: scans the single DB, or
  // every DB in turn when running with multiple databases.
  void ReadReverse(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadReverse(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }

  // Iterates `db` backward from the last key, reading up to reads_ entries,
  // accumulating key+value bytes and throttling through the shared read rate
  // limiter (charged in 1024-op chunks).
  void ReadReverse(ThreadState* thread, DB* db) {
    Iterator* iter = db->NewIterator(read_options_);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
  }

  // Fast random-read benchmark: draws keys uniformly from the next
  // power-of-two range above FLAGS_num (so some keys intentionally do not
  // exist) and issues Gets in batches of 100. Reports found/read counts and
  // how many non-existent keys were issued.
  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    Slice ts;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    DB* db = SelectDBWithCfh(thread)->db;

    // Smallest power of two >= FLAGS_num; masking with (pot - 1) gives a
    // uniform draw over [0, pot).
    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }

    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        std::string ts_ret;
        std::string* ts_ptr = nullptr;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
          ts_ptr = &ts_ret;
        }
        auto status = db->Get(options, key, &value, ts_ptr);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  // Draws a random key index in [0, FLAGS_num). With
  // read_random_exp_range_ == 0 the draw is uniform; otherwise it is
  // exponentially skewed, then scrambled with a multiplicative hash so that
  // the skew does not translate into key-space locality.
  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow is like %(2^64). Will have little impact of results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }

  // Random point-read benchmark. Keys are drawn via GetRandomKey() (or
  // strided within a batch when -multiread_stride is set). Reports
  // found/read counts, byte volume, and PERF_CONTEXT stats when enabled;
  // aborts on any unexpected Get() error.
  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int num_keys = 0;
    int64_t key_rand = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as it
      // is done in DoWrite method.
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            // Clamp so the whole strided batch stays inside [0, FLAGS_num).
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      std::string ts_ret;
      std::string* ts_ptr = nullptr;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
        ts_ptr = &ts_ret;
      }
      Status s;
      pinnable_val.Reset();
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                 &pinnable_val, ts_ptr);
      } else {
        s = db_with_cfh->db->Get(options,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val, ts_ptr);
      }
      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      // Charge the rate limiter in 256-op chunks to reduce overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
             found, read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  // Calls MultiGet over a list of keys from a random distribution.
  // Each iteration issues one MultiGet over entries_per_batch_ keys, using
  // either the vector API or the batched (pinnable) API depending on
  // -multiread_batched. Reports found/read counts and byte volume; aborts
  // on any unexpected per-key error.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t bytes = 0;
    int64_t num_multireads = 0;
    int64_t found = 0;
    ReadOptions options = read_options_;
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]> > key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        // One random start key; remaining keys follow at a fixed stride,
        // clamped so the whole batch stays inside [0, FLAGS_num).
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      if (!FLAGS_multiread_batched) {
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            bytes +=
                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          // Reset per-batch state so the slots can be reused next round.
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      // Charge the rate limiter once every 256 MultiGet calls.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
             found, read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Calls GetApproximateSizes over batches of random key ranges and reports
  // the mean size estimate across all ranges queried.
  void ApproximateSizeRandom(ThreadState* thread) {
    int64_t total_size = 0;
    int64_t total_ranges = 0;
    const size_t num_ranges = entries_per_batch_;
    std::vector<Range> ranges;
    std::vector<Slice> start_keys;
    std::vector<std::unique_ptr<const char[]>> start_key_guards;
    std::vector<Slice> limit_keys;
    std::vector<std::unique_ptr<const char[]>> limit_key_guards;
    std::vector<uint64_t> sizes;
    // Allocate one start/limit key buffer pair per range up front; the key
    // bytes are regenerated in place on every iteration below.
    // (emplace_back has no usable return value before C++17.)
    while (ranges.size() < num_ranges) {
      start_key_guards.emplace_back();
      limit_key_guards.emplace_back();
      start_keys.emplace_back(AllocateKey(&start_key_guards.back()));
      limit_keys.emplace_back(AllocateKey(&limit_key_guards.back()));
      ranges.emplace_back(start_keys.back(), limit_keys.back());
      sizes.push_back(0);
    }
    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      for (size_t i = 0; i < num_ranges; ++i) {
        int64_t low = GetRandomKey(&thread->rand);
        int64_t high = GetRandomKey(&thread->rand);
        if (low > high) {
          std::swap(low, high);
        }
        GenerateKeyFromInt(low, FLAGS_num, &start_keys[i]);
        GenerateKeyFromInt(high, FLAGS_num, &limit_keys[i]);
      }
      db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
                              &sizes[0]);
      total_ranges += entries_per_batch_;
      for (int64_t range_size : sizes) {
        total_size += range_size;
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
             static_cast<double>(total_size) /
                 static_cast<double>(total_ranges));
    thread->stats.AddMessage(msg);
  }

5842
  // Inverts the Pareto CDF: maps a uniform sample u in (0, 1] to a
  // Pareto-distributed value with location theta, shape k, and scale sigma.
  // k == 0.0 degenerates to the exponential form. The result is rounded up
  // to the next integer.
  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
    const double inverted =
        (k == 0.0) ? theta - sigma * std::log(u)
                   : theta + sigma * (std::pow(u, -k) - 1.0) / k;
    return static_cast<int64_t>(std::ceil(inverted));
  }
5852
  // Inverts the power-law CDF y = a * x^b: given a uniform sample u, solves
  // x = (u / a)^(1 / b) and rounds up to the next integer.
  int64_t PowerCdfInversion(double u, double a, double b) {
    const double x = std::pow(u / a, 1.0 / b);
    return static_cast<int64_t>(std::ceil(x));
  }

  // Adds random noise to the QPS. `noise_ratio` scales a uniformly random
  // delta drawn from the band [-FLAGS_sine_a / 2, FLAGS_sine_a / 2).
  // Returns `origin` unchanged when the ratio is outside [0, 1], when the
  // noise band is empty, or when applying the delta would yield a negative
  // rate.
  double AddNoise(double origin, double noise_ratio) {
    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
      return origin;
    }
    int band_int = static_cast<int>(FLAGS_sine_a);
    if (band_int <= 0) {
      // Guard against undefined behavior: `rand() % band_int` is UB when
      // band_int == 0 (e.g. FLAGS_sine_a < 1).
      return origin;
    }
    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
    if (origin + delta < 0) {
      return origin;
    }
    return origin + delta;
  }

5873
  // Decide the ratio of different query types
5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902
  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
  class QueryDecider {
   public:
    std::vector<int> type_;
    std::vector<double> ratio_;
    int range_;

    QueryDecider() {}
    ~QueryDecider() {}

    Status Initiate(std::vector<double> ratio_input) {
      int range_max = 1000;
      double sum = 0.0;
      for (auto& ratio : ratio_input) {
        sum += ratio;
      }
      range_ = 0;
      for (auto& ratio : ratio_input) {
        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
        type_.push_back(range_);
        ratio_.push_back(ratio / sum);
      }
      return Status::OK();
    }

    int GetType(int64_t rand_num) {
      if (rand_num < 0) {
        rand_num = rand_num * (-1);
      }
5903
      assert(range_ != 0);
5904 5905 5906 5907 5908 5909 5910 5911 5912 5913
      int pos = static_cast<int>(rand_num % range_);
      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
        if (pos < type_[i]) {
          return i;
        }
      }
      return 0;
    }
  };

5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936
  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to transfer a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    // First slot of this key-range in the [0, amplify] sampling space.
    int64_t keyrange_start;
    // Number of sampling-space slots owned by this key-range; proportional
    // to the key-range's access probability (hotness).
    int64_t keyrange_access;
    // Number of keys stored in this key-range.
    int64_t keyrange_keys;
  };

  // From our observations, the prefix hotness (key-range hotness) follows
  // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
  // However, we cannot directly use the inverse function to decide a
  // key-range from a random distribution. To achieve it, we create a list of
  // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
  // decided based on the hotness of the key-range. When a random value is
  // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
  // and one KeyrangeUnit is selected. The probability of a  KeyrangeUnit being
  // selected is the same as the hotness of this KeyrangeUnit. After that, the
  // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
  // can based on the power distribution (y=ax^b) to generate the offset of
  // the key in the selected key-range. In this way, we generate the keyID
  // based on the hotness of the prefix and also the key hotness distribution.
  class GenerateTwoTermExpKeys {
   public:
    // Avoid uninitialized warning-as-error in some compilers
    int64_t keyrange_rand_max_ = 0;  // total size of the sampling space
    int64_t keyrange_size_ = 0;      // number of keys in each key-range
    int64_t keyrange_num_ = 0;       // number of key-ranges
    std::vector<KeyrangeUnit> keyrange_set_;

    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate the key-range shares size based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key range will be
        // accessed in a query. It is based on the two-term expoential
        // distribution
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        // Probabilities below 1e-16 are treated as zero (no accesses).
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
        // by 1/(smallest key-range probability). In this way, we ensure that
        // all key-ranges are assigned with an Integer that  >=0
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }

        // Step 3. For each key-range, we calculate its position in the
        // [0, amplify] range, including the start, the size (keyrange_access)
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;

      // Step 4. Shuffle the key-ranges randomly
      // Since the access probability is calculated from small to large,
      // If we do not re-allocate them, hot key-ranges are always at the end
      // and cold key-ranges are at the begin of the key space. Therefore, the
      // key-ranges are shuffled and the rand seed is only decide by the
      // key-range hotness distribution. With the same distribution parameters
      // the shuffle results are the same.
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start postion after shuffling
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }

      return Status::OK();
    }

    // Generate the Key ID according to the input ini_rand and key distribution
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Calculate and select one key-range that contains the new key
      // (binary search on the cumulative keyrange_start values).
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
        // No key-hotness modeling: pick a uniformly random offset.
        key_offset = ini_rand % keyrange_size_;
      } else {
        // Power-distribution key hotness: derive a seed from the inverse
        // power CDF, then draw the offset from that seeded RNG.
        double u =
            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
        key_seed = static_cast<int64_t>(
            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = rand_key.Next() % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };

J
junhan lee 已提交
6052
  // The social graph workload mixed with Get, Put, Iterator queries.
  // The value size and iterator length follow Pareto distribution.
  // The overall key access follow power distribution. If user models the
  // workload based on different key-ranges (or different prefixes), user
  // can use two-term-exponential distribution to fit the workload. User
  // needs to decide the ratio between Get, Put, Iterator queries before
  // starting the benchmark.
  void MixGraph(ThreadState* thread) {
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t get_found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    double total_scan_length = 0;
    double total_val_size = 0;
    // Cap on any single value size (1 MB), also sized to the stack buffer
    // used to copy out scanned values below.
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    bool use_prefix_modeling = false;
    bool use_random_modeling = false;
    GenerateTwoTermExpKeys gen_exp;
    // Ratios of Get : Put : Seek queries; normalized by QueryDecider.
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // the limit of qps initiation
    if (FLAGS_sine_mix_rate) {
      thread->shared->read_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    // Decide if user wants to use prefix based key generation
    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
      use_prefix_modeling = true;
      gen_exp.InitiateExpDistribution(
          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
    }
    // Zero power-distribution parameters disable key-hotness modeling.
    if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
      use_random_modeling = true;
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t ini_rand, rand_v, key_rand, key_seed;
      ini_rand = GetRandomKey(&thread->rand);
      rand_v = ini_rand % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;

      // Generate the keyID based on the key hotness and prefix hotness
      if (use_random_modeling) {
        key_rand = ini_rand;
      } else if (use_prefix_modeling) {
        key_rand =
            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
      } else {
        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
        Random64 rand(key_seed);
        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      int query_type = query.GetType(rand_v);

      // change the qps
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      // Periodically recompute the sine-modulated rate limits and split the
      // budget between reads (Get + Seek) and writes (Put) by query ratio.
      if (FLAGS_sine_mix_rate &&
          usecs_since_last >
              (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate = mix_rate_with_noise * query.ratio_[1];

        if (read_rate > 0) {
          thread->shared->read_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(read_rate));
        }
        if (write_rate > 0) {
          thread->shared->write_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(write_rate));
        }
      }
      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                   key, &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(read_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }

        if (s.ok()) {
          get_found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        // Charge the rate limiter once per 100 read-type ops.
        if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
          thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
                                                     nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        // Value size follows a Pareto distribution, clamped to [10,
        // value_max) (sizes above the cap wrap via modulo).
        int64_t val_size = ParetoCdfInversion(
            u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
        if (val_size < 10) {
          val_size = 10;
        } else if (val_size > value_max) {
          val_size = val_size % value_max;
        }
        total_val_size += val_size;

        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(val_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }

        if (thread->shared->write_rate_limiter && puts % 100 == 0) {
          thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
                                                      nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(read_options_);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            // Scan length follows a Pareto distribution, capped by modulo.
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              // Copy the value out to make sure it is actually read.
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
              total_scan_length++;
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    // NOTE(review): the averages divide by puts/seek; with a zero count the
    // double division yields inf/nan in the printed stats (no crash).
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
             ", reads %" PRIu64 " in %" PRIu64
             " found, "
             "avg size: %.1f value, %.1f scan)\n",
             gets, puts, seek, get_found + seek_found, gets + seek,
             total_val_size / puts, total_scan_length / seek);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

6262 6263
  // Measures the cost of iterator construction: repeatedly creates and
  // immediately destroys an iterator on a randomly selected DB.
  void IteratorCreation(ThreadState* thread) {
    Duration duration(FLAGS_duration, reads_);
    ReadOptions opts = read_options_;
    std::unique_ptr<char[]> ts_guard;
    const bool use_ts = user_timestamp_size_ > 0;
    if (use_ts) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      Slice ts;
      if (use_ts) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        opts.timestamp = &ts;
      }
      delete db->NewIterator(opts);
      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }
  }

6282 6283 6284 6285
  // Thread 0 acts as the background writer; every other thread runs the
  // iterator-creation benchmark concurrently.
  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGWriter(thread, kWrite);
    } else {
      IteratorCreation(thread);
    }
  }

S
Sanjay Ghemawat 已提交
6290
  // Seeks to random keys and scans FLAGS_seek_nexts entries from each seek
  // position, optionally bounded by FLAGS_max_scan_distance and optionally
  // through long-lived tailing iterators.
  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    // With tailing iterators, create one iterator per DB up front and reuse
    // them for every seek; otherwise a fresh iterator is made per seek.
    std::vector<Iterator*> tailing_iters;
    if (FLAGS_use_tailing_iterator) {
      if (db_.db != nullptr) {
        tailing_iters.push_back(db_.db->NewIterator(options));
      } else {
        for (const auto& db_with_cfh : multi_dbs_) {
          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
        }
      }
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      // Constrain the scan to at most FLAGS_max_scan_distance keys by
      // installing an iterate bound on the side the scan will move toward.
      if (FLAGS_max_scan_distance != 0) {
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      }

      // Pick a Iterator to use
      size_t db_idx_to_use =
          (db_.db == nullptr)
              ? (size_t{thread->rand.Next()} % multi_dbs_.size())
              : 0;
      std::unique_ptr<Iterator> single_iter;
      Iterator* iter_to_use;
      if (FLAGS_use_tailing_iterator) {
        iter_to_use = tailing_iters[db_idx_to_use];
      } else {
        if (db_.db != nullptr) {
          single_iter.reset(db_.db->NewIterator(options));
        } else {
          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
        }
        iter_to_use = single_iter.get();
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read them.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      // Charge the read rate limiter once every 256 seeks.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    for (auto iter : tailing_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
             found, read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
L
Lei Jin 已提交
6405 6406 6407 6408 6409

  // Thread 0 performs background writes; all remaining threads run the
  // SeekRandom benchmark concurrently.
  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGWriter(thread, kWrite);
    } else {
      SeekRandom(thread);
    }
  }
S
Sanjay Ghemawat 已提交
6413

6414 6415 6416 6417 6418 6419 6420 6421
  // Thread 0 performs background merges; all remaining threads run the
  // SeekRandom benchmark concurrently.
  void SeekRandomWhileMerging(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGWriter(thread, kMerge);
    } else {
      SeekRandom(thread);
    }
  }

S
Sanjay Ghemawat 已提交
6422
  void DoDelete(ThreadState* thread, bool seq) {
6423
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6424
                     /*protection_bytes_per_key=*/0, user_timestamp_size_);
Y
Yueh-Hsuan Chiang 已提交
6425
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
L
Lei Jin 已提交
6426
    int64_t i = 0;
6427 6428
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
6429 6430 6431 6432 6433
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
L
Lei Jin 已提交
6434

M
Mark Callaghan 已提交
6435
    while (!duration.Done(entries_per_batch_)) {
6436
      DB* db = SelectDB(thread);
S
Sanjay Ghemawat 已提交
6437
      batch.Clear();
L
Lei Jin 已提交
6438 6439 6440
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
6441
        batch.Delete(key);
S
Sanjay Ghemawat 已提交
6442
      }
6443 6444 6445
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
6446 6447
        s = batch.UpdateTimestamps(
            ts, [this](uint32_t) { return user_timestamp_size_; });
6448 6449 6450 6451 6452 6453
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
          ErrorExit();
        }
      }
      s = db->Write(write_options_, &batch);
6454
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
S
Sanjay Ghemawat 已提交
6455 6456 6457 6458
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
L
Lei Jin 已提交
6459
      i += entries_per_batch_;
S
Sanjay Ghemawat 已提交
6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470
    }
  }

  void DeleteSeq(ThreadState* thread) {
    DoDelete(thread, true);
  }

  // Deletes uniformly random keys for the configured duration.
  void DeleteRandom(ThreadState* thread) {
    DoDelete(thread, /*seq=*/false);
  }

6471 6472 6473 6474
  // Thread 0 performs background writes; all remaining threads run the
  // ReadRandom benchmark concurrently.
  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGWriter(thread, kWrite);
    } else {
      ReadRandom(thread);
    }
  }
6478

M
Mark Callaghan 已提交
6479 6480 6481 6482 6483 6484 6485 6486
  // Thread 0 performs background merges; all remaining threads run the
  // ReadRandom benchmark concurrently.
  void ReadWhileMerging(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGWriter(thread, kMerge);
    } else {
      ReadRandom(thread);
    }
  }

6487
  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6488 6489
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
6490
    int64_t bytes = 0;
6491

6492 6493 6494 6495 6496
    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
6497 6498 6499 6500

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

6501 6502
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
6503 6504 6505 6506
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
6507 6508
    uint32_t written = 0;
    bool hint_printed = false;
6509 6510

    while (true) {
6511
      DB* db = SelectDB(thread);
6512 6513
      {
        MutexLock l(&thread->shared->mu);
6514 6515 6516 6517
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
6518 6519
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
6520 6521 6522 6523
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
6524
                      static_cast<int>(writes_) - written);
6525 6526 6527 6528 6529 6530
              hint_printed = true;
            }
          } else {
            // Finish the write immediately
            break;
          }
6531
        }
6532 6533 6534
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
M
Mark Callaghan 已提交
6535 6536
      Status s;

6537
      Slice val = gen.Generate();
6538 6539 6540 6541
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
      }
6542
      if (write_merge == kWrite) {
6543 6544 6545 6546 6547
        if (user_timestamp_size_ == 0) {
          s = db->Put(write_options_, key, val);
        } else {
          s = db->Put(write_options_, key, ts, val);
        }
M
Mark Callaghan 已提交
6548
      } else {
6549
        s = db->Merge(write_options_, key, val);
M
Mark Callaghan 已提交
6550
      }
6551
      // Restore write_options_
6552
      written++;
M
Mark Callaghan 已提交
6553

6554
      if (!s.ok()) {
M
Mark Callaghan 已提交
6555
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6556 6557
        exit(1);
      }
6558
      bytes += key.size() + val.size() + user_timestamp_size_;
6559
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6560

6561 6562
      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
6563
            key.size() + val.size(), Env::IO_HIGH,
6564
            nullptr /* stats */, RateLimiter::OpType::kWrite);
6565 6566
      }
    }
6567
    thread->stats.AddBytes(bytes);
6568 6569
  }

Y
Yi Wu 已提交
6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583
  // Thread 0 performs a continuous background scan; all remaining threads
  // run the ReadRandom benchmark concurrently.
  void ReadWhileScanning(ThreadState* thread) {
    if (thread->tid <= 0) {
      BGScan(thread);
    } else {
      ReadRandom(thread);
    }
  }

  // Background scanner used by ReadWhileScanning: continuously walks the
  // single DB with one iterator, restarting from the first key whenever the
  // iterator is exhausted. Multiple DBs are not supported.
  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_options.timestamp = &ts;
    }
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t restarts = 0;
    uint64_t steps = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        iter->SeekToFirst();
        ++restarts;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        ++steps;
      }
      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }

  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
6617
  // in DB atomically i.e in a single batch. Also refer GetMany.
6618 6619
  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
                 const Slice& value) {
6620 6621 6622
    std::string suffixes[3] = {"2", "1", "0"};
    std::string keys[3];

6623
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6624
                     /*protection_bytes_per_key=*/0, user_timestamp_size_);
6625 6626 6627 6628 6629 6630
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Put(keys[i], value);
    }

6631 6632 6633 6634
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6635 6636
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
6637 6638 6639 6640 6641 6642 6643
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

6644
    s = db->Write(writeoptions, &batch);
6645 6646 6647 6648 6649
    return s;
  }


  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
6650
  // in DB atomically i.e in a single batch. Also refer GetMany.
6651 6652
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
6653 6654 6655
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

6656 6657
    WriteBatch batch(0, 0, /*protection_bytes_per_key=*/0,
                     user_timestamp_size_);
6658 6659 6660 6661 6662 6663
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

6664 6665 6666 6667
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6668 6669
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
6670 6671 6672 6673 6674 6675 6676
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

6677
    s = db->Write(writeoptions, &batch);
6678 6679 6680 6681 6682
    return s;
  }

  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
6683
  // ASSUMES that PutMany was used to put (K, V) into the DB.
6684
  Status GetMany(DB* db, const Slice& key, std::string* value) {
6685 6686 6687 6688
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
6689
    ReadOptions readoptionscopy = read_options_;
6690 6691 6692 6693 6694 6695 6696 6697 6698

    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->Allocate(ts_guard.get());
      readoptionscopy.timestamp = &ts;
    }

6699
    readoptionscopy.snapshot = db->GetSnapshot();
6700 6701 6702 6703
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
6704
      s = db->Get(readoptionscopy, key_slices[i], value);
6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
6716
    db->ReleaseSnapshot(readoptionscopy.snapshot);
6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }

  // Differs from readrandomwriterandom in the following ways:
6730
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
6731 6732 6733 6734
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  //     multiple writes (including puts and deletes) it uses upto
  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
6735
  // (d) Does not have a MultiGet option.
6736 6737 6738
  void RandomWithVerify(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
6739
    int64_t found = 0;
6740 6741 6742
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
6743 6744 6745
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;
6746

6747 6748
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
L
Lei Jin 已提交
6749

6750
    // the number of iterations is the larger of read_ or write_
6751
    for (int64_t i = 0; i < readwrites_; i++) {
6752
      DB* db = SelectDB(thread);
6753
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
6754
        // one batch completed, reinitialize for next batch
6755 6756 6757 6758
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
L
Lei Jin 已提交
6759 6760
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
          FLAGS_numdistinct, &key);
6761 6762
      if (get_weight > 0) {
        // do all the gets first
6763
        Status s = GetMany(db, key, &value);
6764
        if (!s.ok() && !s.IsNotFound()) {
6765
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
6766 6767 6768 6769 6770 6771 6772
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
6773
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
6774 6775 6776
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
6777
        Status s = PutMany(db, write_options_, key, gen.Generate());
6778
        if (!s.ok()) {
6779
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
6780 6781 6782 6783
          exit(1);
        }
        put_weight--;
        puts_done++;
6784
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6785
      } else if (delete_weight > 0) {
6786
        Status s = DeleteMany(db, write_options_, key);
6787
        if (!s.ok()) {
6788
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
6789 6790 6791 6792
          exit(1);
        }
        delete_weight--;
        deletes_done++;
6793
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
6794 6795
      }
    }
D
Daniel Black 已提交
6796
    char msg[128];
6797
    snprintf(msg, sizeof(msg),
6798 6799
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \
             PRIu64 " found:%" PRIu64 ")",
6800 6801 6802 6803
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // This is different from ReadWhileWriting because it does not use
6805
  // an extra thread.
6806
  void ReadRandomWriteRandom(ThreadState* thread) {
6807
    ReadOptions options = read_options_;
6808 6809
    RandomGenerator gen;
    std::string value;
6810
    int64_t found = 0;
6811 6812
    int get_weight = 0;
    int put_weight = 0;
6813 6814
    int64_t reads_done = 0;
    int64_t writes_done = 0;
M
Mark Callaghan 已提交
6815 6816
    Duration duration(FLAGS_duration, readwrites_);

6817 6818
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
L
Lei Jin 已提交
6819

6820 6821 6822 6823 6824
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

6825
    // the number of iterations is the larger of read_ or write_
M
Mark Callaghan 已提交
6826
    while (!duration.Done(1)) {
6827
      DB* db = SelectDB(thread);
L
Lei Jin 已提交
6828
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6829
      if (get_weight == 0 && put_weight == 0) {
X
Xing Jin 已提交
6830
        // one batch completed, reinitialize for next batch
6831 6832 6833 6834 6835
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
6836 6837 6838 6839 6840 6841
        Slice ts;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
        }
6842
        Status s = db->Get(options, key, &value);
6843 6844 6845 6846 6847 6848 6849
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
6850 6851
        get_weight--;
        reads_done++;
6852
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
6853 6854 6855
      } else  if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
6856
        Status s;
6857
        if (user_timestamp_size_ > 0) {
6858 6859 6860 6861
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, gen.Generate());
        } else {
          s = db->Put(write_options_, key, gen.Generate());
6862
        }
6863 6864
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6865
          ErrorExit();
6866 6867 6868
        }
        put_weight--;
        writes_done++;
6869
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
6870 6871 6872
      }
    }
    char msg[100];
6873 6874
    snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
             " total:%" PRIu64 " found:%" PRIu64 ")",
6875
             reads_done, writes_done, readwrites_, found);
6876 6877 6878
    thread->stats.AddMessage(msg);
  }

  //
  // Read-modify-write for random keys: Get the current value, then Put a
  // freshly generated value back under the same key.
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with newest timestamp because we are doing rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, val);
      } else {
        s = db->Put(write_options_, key, val);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    // readwrites_ and found are signed int64_t; PRId64 fixes the previous
    // PRIu64 signed/unsigned mismatch.
    snprintf(msg, sizeof(msg), "( updates:%" PRId64 " found:%" PRId64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-XOR-write for random keys. Xors the existing value with a randomly
  // generated value, and stores the result. Assuming A in the array of bytes
  // representing the existing value, we generate an array B of the same size,
  // then compute C = A^B as C[i]=A[i]^B[i], and store C
  void XORUpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      // Generate B with the same length as the existing value A.
      Slice value =
          gen.Generate(static_cast<unsigned int>(existing_value.size()));
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        // Key absent: XOR against nothing, i.e. store B itself.
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, Slice(new_value));
      } else {
        s = db->Put(write_options_, key, Slice(new_value));
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    // readwrites_ and found are signed int64_t; PRId64 fixes the previous
    // PRIu64 signed/unsigned mismatch.
    snprintf(msg, sizeof(msg), "( updates:%" PRId64 " found:%" PRId64 ")",
             readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys.
  // Each operation causes the key grow by value_size (simulating an append).
  // Generally used for benchmarking against merges of similar type
  void AppendRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate();
      if (!value.empty()) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1, ',');
      }
      value.append(operand.data(), operand.size());

      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, value);
      } else {
        // Write back to the database
        s = db->Put(write_options_, key, value);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      bytes += key.size() + value.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    // readwrites_ and found are signed int64_t; PRId64 fixes the previous
    // PRIu64 signed/unsigned mismatch.
    snprintf(msg, sizeof(msg), "( updates:%" PRId64 " found:%" PRId64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys (using MergeOperator)
  // The merge operator to use should be defined by FLAGS_merge_operator
  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
  // Assumes that the merge operator is non-null (i.e.: is well-defined)
  //
  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
  // to simulate random additions over 64-bit integers using merge.
7089 7090 7091
  //
  // The number of merges on the same key can be controlled by adjusting
  // FLAGS_merge_keys.
D
Deon Nicholas 已提交
7092 7093
  void MergeRandom(ThreadState* thread) {
    RandomGenerator gen;
7094
    int64_t bytes = 0;
7095 7096
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
D
Deon Nicholas 已提交
7097 7098 7099
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
7100 7101 7102
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t key_rand = thread->rand.Next() % merge_keys_;
      GenerateKeyFromInt(key_rand, merge_keys_, &key);
D
Deon Nicholas 已提交
7103

7104
      Status s;
7105
      Slice val = gen.Generate();
7106 7107 7108
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->GetCfh(key_rand), key,
7109
                                   val);
7110 7111 7112
      } else {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
7113
                                   val);
7114
      }
D
Deon Nicholas 已提交
7115 7116 7117 7118 7119

      if (!s.ok()) {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
7120
      bytes += key.size() + val.size();
7121
      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
D
Deon Nicholas 已提交
7122 7123 7124 7125
    }

    // Print some statistics
    char msg[100];
7126
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
7127
    thread->stats.AddBytes(bytes);
D
Deon Nicholas 已提交
7128 7129 7130
    thread->stats.AddMessage(msg);
  }

  // Read and merge random keys. The amount of reads and merges are controlled
  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
  // keys (and thus also the number of reads and merges on the same key) can be
  // adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;
    int64_t num_gets = 0;
    int64_t num_merges = 0;
    size_t max_length = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      // FLAGS_mergereadpercent of the operations are merges, the rest gets.
      bool do_merge =
          static_cast<int>(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(read_options_, key, &value);
        // Track the largest merged value observed.
        if (value.length() > max_length) {
          max_length = value.length();
        }

        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    // Counters are signed int64_t; PRId64 fixes the previous PRIu64
    // signed/unsigned mismatch. max_length stays size_t/ROCKSDB_PRIszt.
    snprintf(msg, sizeof(msg),
             "(reads:%" PRId64 " merges:%" PRId64 " total:%" PRId64
             " hits:%" PRId64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }

  // Fills the DB sequentially, then verifies iterator behavior: each key is
  // located with Seek(), stepped over with Next()/Prev() for
  // FLAGS_seek_nexts positions, and re-located with a final Seek().
  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    ReadOptions read_opts = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_opts.timestamp = &ts;
    }
    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      for (int step = 0; step < FLAGS_seek_nexts && i + 1 < FLAGS_num;
           ++step) {
        // Advance (or retreat with --reverse_iterator) and confirm the
        // iterator lands on the expected sequential key.
        if (FLAGS_reverse_iterator) {
          iter->Prev();
        } else {
          iter->Next();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }

  // Binary search for `key` in the inclusive, sorted index range
  // [start, end] of `data`. Returns true iff the key is found.
  // Tolerates an empty vector and start > end (callers pass
  // data.size() - 1 as `end`, which is -1 for an empty list); a midpoint
  // beyond the last valid index also terminates the search, matching the
  // original recursive implementation. Rewritten iteratively to avoid
  // unbounded recursion depth on large inputs.
  bool binary_search(std::vector<int>& data, int start, int end, int key) {
    if (data.empty()) return false;
    while (start <= end) {
      const int mid = start + (end - start) / 2;
      if (mid > static_cast<int>(data.size()) - 1) return false;
      if (data[mid] == key) {
        return true;
      } else if (data[mid] > key) {
        end = mid - 1;
      } else {
        start = mid + 1;
      }
    }
    return false;
  }

  // Does a bunch of merge operations for a key(key1) where the merge operand
  // is a sorted list. Next performance comparison is done between doing a Get
  // for key1 followed by searching for another key(key2) in the large sorted
  // list vs calling GetMergeOperands for key1 and then searching for the key2
  // in all the sorted sub-lists. Later case is expected to be a lot faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;
    std::string key = "my_key";
    std::string value;

    // Build ~kTotalValues/kListSize merge operands, each a comma-separated
    // sorted list of integers, all merged under the single key above.
    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with and it will demonstrate the
    // perf difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list vs doing GetMergeOperands and searching
    // for lookup_key within this resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call
    // Timed path 1: Get() returns one fully-merged list; search it whole.
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call
    // Timed path 2: fetch the raw operands and search each (smaller) sorted
    // sub-list separately, stopping at the first hit.
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) break;
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }

#ifndef ROCKSDB_LITE
7323 7324 7325 7326
  void VerifyChecksum(ThreadState* thread) {
    DB* db = SelectDB(thread);
    ReadOptions ro;
    ro.adaptive_readahead = FLAGS_adaptive_readahead;
7327
    ro.async_io = FLAGS_async_io;
7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341
    ro.rate_limiter_priority =
        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
    ro.readahead_size = FLAGS_readahead_size;
    Status s = db->VerifyChecksum(ro);
    if (!s.ok()) {
      fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
      exit(1);
    }
  }

  // Verifies whole-file checksums of all SST files in the selected DB,
  // honoring the readahead / async-IO / user-op rate-limiting flags. Exits
  // the process on any mismatch or read failure.
  void VerifyFileChecksums(ThreadState* thread) {
    DB* db = SelectDB(thread);
    ReadOptions ro;
    ro.adaptive_readahead = FLAGS_adaptive_readahead;
    ro.async_io = FLAGS_async_io;
    ro.rate_limiter_priority =
        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
    ro.readahead_size = FLAGS_readahead_size;
    Status s = db->VerifyFileChecksums(ro);
    if (!s.ok()) {
      fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
              s.ToString().c_str());
      exit(1);
    }
  }

A
agiardullo 已提交
7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367
  // This benchmark stress tests Transactions.  For a given --duration (or
  // total number of --writes, a Transaction will perform a read-modify-write
  // to increment the value of a key in each of N(--transaction-sets) sets of
  // keys (where each set has --num keys).  If --threads is set, this will be
  // done in parallel.
  //
  // To test transactions, use --transaction_db=true.  Not setting this
  // parameter
  // will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the results
  // by checking if the sum of all keys in each set is the same.
  void RandomTransaction(ThreadState* thread) {
    Duration duration(FLAGS_duration, readwrites_);
S
SherlockNoMad 已提交
7368
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
A
agiardullo 已提交
7369
    uint64_t transactions_done = 0;
A
agiardullo 已提交
7370 7371 7372 7373 7374 7375

    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      abort();
    }

A
agiardullo 已提交
7376 7377 7378 7379 7380
    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
7381
                                       read_options_, FLAGS_num,
A
agiardullo 已提交
7382 7383
                                       num_prefix_ranges);

A
agiardullo 已提交
7384 7385 7386 7387 7388 7389 7390 7391
    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      abort();
    }

    while (!duration.Done(1)) {
A
agiardullo 已提交
7392
      bool success;
A
agiardullo 已提交
7393

A
agiardullo 已提交
7394 7395
      // RandomTransactionInserter will attempt to insert a key for each
      // # of FLAGS_transaction_sets
A
agiardullo 已提交
7396
      if (FLAGS_optimistic_transaction_db) {
A
agiardullo 已提交
7397
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
A
agiardullo 已提交
7398 7399
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
A
agiardullo 已提交
7400
        success = inserter.TransactionDBInsert(txn_db, txn_options);
A
agiardullo 已提交
7401
      } else {
A
agiardullo 已提交
7402
        success = inserter.DBInsert(db_.db);
A
agiardullo 已提交
7403 7404
      }

A
agiardullo 已提交
7405 7406 7407 7408
      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        abort();
7409 7410
      }

A
agiardullo 已提交
7411
      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
A
agiardullo 已提交
7412 7413 7414 7415
      transactions_done++;
    }

    char msg[100];
A
agiardullo 已提交
7416
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
A
agiardullo 已提交
7417 7418
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
A
agiardullo 已提交
7419
               transactions_done, inserter.GetFailureCount());
A
agiardullo 已提交
7420 7421 7422 7423 7424
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);

7425
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7426 7427
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
A
agiardullo 已提交
7428
    }
7429
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
A
agiardullo 已提交
7430 7431 7432 7433 7434 7435
  }

  // Verifies consistency of data after RandomTransaction() has been run.
  // Since each iteration of RandomTransaction() incremented a key in each set
  // by the same value, the sum of the keys in each set should be the same.
  void RandomTransactionVerify() {
A
agiardullo 已提交
7436
    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
A
agiardullo 已提交
7437 7438 7439 7440
      // transactions not used, nothing to verify.
      return;
    }

A
agiardullo 已提交
7441
    Status s =
S
SherlockNoMad 已提交
7442 7443
        RandomTransactionInserter::Verify(db_.db,
                            static_cast<uint16_t>(FLAGS_transaction_sets));
A
agiardullo 已提交
7444

A
agiardullo 已提交
7445 7446 7447 7448
    if (s.ok()) {
      fprintf(stdout, "RandomTransactionVerify Success.\n");
    } else {
      fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
A
agiardullo 已提交
7449 7450
    }
  }
7451
#endif  // ROCKSDB_LITE
A
agiardullo 已提交
7452

A
Andres Noetzli 已提交
7453 7454 7455 7456 7457 7458 7459 7460
  // Writes and deletes random keys without overwriting keys.
  //
  // This benchmark is intended to partially replicate the behavior of MyRocks
  // secondary indices: All data is stored in keys and updates happen by
  // deleting the old version of the key and inserting the new version.
  void RandomReplaceKeys(ThreadState* thread) {
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // counters[i] is the current version suffix for key-set i.
    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
    size_t max_counter = 50;
    RandomGenerator gen;

    Status s;
    DB* db = SelectDB(thread);
    // Seed the DB with version 0 of every distinct key.
    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, gen.Generate());
      } else {
        s = db->Put(write_options_, key, gen.Generate());
      }
      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    }

    // Pin the initial versions so deleted keys are not reclaimed.
    db->GetSnapshot();

    std::default_random_engine generator;
    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
                                                  FLAGS_stddev);
    Duration duration(FLAGS_duration, FLAGS_num);
    while (!duration.Done(1)) {
      // Pick a key-set with a normal distribution, clamped to valid range.
      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
                                static_cast<int64_t>(0));
      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                         &key);
      // Delete the current version of the key...
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
                                     : db->Delete(write_options_, key, ts);
      } else {
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
                                     : db->Delete(write_options_, key);
      }
      if (s.ok()) {
        // ...then insert the next version (empty value: data lives in keys).
        counters[key_id] = (counters[key_id] + 1) % max_counter;
        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                           &key);
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, Slice());
        } else {
          s = db->Put(write_options_, key, Slice());
        }
      }

      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }

    char msg[200];
    snprintf(msg, sizeof(msg),
             "use single deletes: %d, "
             "standard deviation: %lf\n",
             FLAGS_use_single_deletes, FLAGS_stddev);
    thread->stats.AddMessage(msg);
  }

7533 7534 7535 7536 7537 7538 7539 7540
  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;

    Iterator* iter = nullptr;
    // Only work on single database
    assert(db_.db != nullptr);
7541
    iter = db_.db->NewIterator(read_options_);
7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    char value_buffer[256];
    while (true) {
      {
        MutexLock l(&thread->shared->mu);
        if (thread->shared->num_done >= 1) {
          // Write thread have finished
          break;
        }
      }
      if (!FLAGS_use_tailing_iterator) {
        delete iter;
7557
        iter = db_.db->NewIterator(read_options_);
7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593
      }
      // Pick a Iterator to use

      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Reset last 8 bytes to 0
      char* start = const_cast<char*>(key.data());
      start += key.size() - 8;
      memset(start, 0, 8);
      ++read;

      bool key_found = false;
      // Seek the prefix
      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
           iter->Next()) {
        key_found = true;
        // Copy out iterator's value to make sure we read them.
        if (do_deletion) {
          bytes += iter->key().size();
          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
            db_.db->Delete(write_options_, iter->key());
          } else {
            break;
          }
        } else {
          bytes += iter->key().size() + iter->value().size();
          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
          Slice value = iter->value();
          memcpy(value_buffer, value.data(),
                 std::min(value.size(), sizeof(value_buffer)));

          assert(iter->status().ok());
        }
      }
      found += key_found;
7594 7595

      if (thread->shared->read_rate_limiter.get() != nullptr) {
7596 7597
        thread->shared->read_rate_limiter->Request(
            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
7598
      }
7599 7600 7601 7602 7603 7604 7605 7606
    }
    delete iter;

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
7607
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7608 7609
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654
    }
  }

  // Writer side of the "timeseries" benchmark: encodes a random key id
  // followed by a monotonically increasing emulated timestamp into each key
  // and Put()s a generated value, optionally throttled by
  // --benchmark_write_rate_limit.
  void TimeSeriesWrite(ThreadState* thread) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    Duration duration(FLAGS_duration, writes_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);

      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      // Write key id
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Write timestamp

      char* start = const_cast<char*>(key.data());
      char* pos = start + 8;
      int bytes_to_fill =
          std::min(key_size_ - static_cast<int>(pos - start), 8);
      uint64_t timestamp_value = timestamp_emulator_->Get();
      if (port::kLittleEndian) {
        // Store big-endian so lexicographic key order matches time order.
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
      }

      timestamp_emulator_->Inc();

      Status s;
      Slice val = gen.Generate();
      s = db->Put(write_options_, key, val);

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      bytes = key.size() + val.size();
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      thread->stats.AddBytes(bytes);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
            key.size() + val.size(), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
      }
    }
  }

  // Entry point for the "timeseries" benchmark. Thread 0 is the sole writer;
  // every other thread reads, or deletes expired entries when
  // --expire_style=delete assigns it to deletion duty.
  void TimeSeries(ThreadState* thread) {
    if (thread->tid == 0) {
      // The writer reports its own stats; they are excluded from the
      // merged reader stats.
      TimeSeriesWrite(thread);
      thread->stats.Stop();
      thread->stats.Report("timeseries write");
      return;
    }
    const bool do_deletion = FLAGS_expire_style == "delete" &&
                             thread->tid <= FLAGS_num_deletion_threads;
    TimeSeriesReadOrDelete(thread, do_deletion);
  }

7686
  void Compact(ThreadState* thread) {
7687
    DB* db = SelectDB(thread);
7688
    CompactRangeOptions cro;
7689 7690
    cro.bottommost_level_compaction =
        BottommostLevelCompaction::kForceOptimized;
7691
    db->CompactRange(cro, nullptr, nullptr);
J
jorlow@chromium.org 已提交
7692 7693
  }

7694 7695 7696 7697 7698 7699 7700 7701 7702
  void CompactAll() {
    if (db_.db != nullptr) {
      db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
  }

7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729
#ifndef ROCKSDB_LITE
  void WaitForCompactionHelper(DBWithColumnFamilies& db) {
    // This is an imperfect way of waiting for compaction. The loop and sleep
    // is done because a thread that finishes a compaction job should get a
    // chance to pickup a new compaction job.

    std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
                                     DB::Properties::kNumRunningFlushes,
                                     DB::Properties::kCompactionPending,
                                     DB::Properties::kNumRunningCompactions};

    fprintf(stdout, "waitforcompaction(%s): started\n",
            db.db->GetName().c_str());

    while (true) {
      bool retry = false;

      for (const auto& k : keys) {
        uint64_t v;
        if (!db.db->GetIntProperty(k, &v)) {
          fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
                  db.db->GetName().c_str(), k.c_str());
          exit(1);
        } else if (v > 0) {
          fprintf(stdout,
                  "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
                  db.db->GetName().c_str(), k.c_str());
7730
          FLAGS_env->SleepForMicroseconds(10 * 1000000);
7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745
          retry = true;
          break;
        }
      }

      if (!retry) {
        fprintf(stdout, "waitforcompaction(%s): finished\n",
                db.db->GetName().c_str());
        return;
      }
    }
  }

  void WaitForCompaction() {
    // Give background threads a chance to wake
7746
    FLAGS_env->SleepForMicroseconds(5 * 1000000);
7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863

    // I am skeptical that this check race free. I hope that checking twice
    // reduces the chance.
    if (db_.db != nullptr) {
      WaitForCompactionHelper(db_);
      WaitForCompactionHelper(db_);
    } else {
      for (auto& db_with_cfh : multi_dbs_) {
        WaitForCompactionHelper(db_with_cfh);
        WaitForCompactionHelper(db_with_cfh);
      }
    }
  }

  // Compacts every live file on `from_level` into the next populated level
  // using CompactFiles(). With dynamic leveled compaction the source and
  // target levels are discovered from live-file metadata rather than assumed
  // to be from_level and from_level+1. Returns true when done (including the
  // nothing-to-compact cases) and false when CompactFiles() failed, in which
  // case the caller should wait for background work and retry.
  bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
    std::vector<LiveFileMetaData> files;
    db_with_cfh.db->GetLiveFilesMetaData(&files);

    // Only compacting from L0 or the first populated level is supported.
    assert(from_level == 0 || from_level == 1);

    int real_from_level = from_level;
    if (real_from_level > 0) {
      // With dynamic leveled compaction the first level with data beyond L0
      // might not be L1.
      real_from_level = std::numeric_limits<int>::max();

      for (auto& f : files) {
        if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
      }

      // max() sentinel surviving the scan means no file exists beyond L0.
      if (real_from_level == std::numeric_limits<int>::max()) {
        fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
        return true;
      }
    }

    // The goal is to compact from from_level to the level that follows it,
    // and with dynamic leveled compaction the next level might not be
    // real_from_level+1
    int next_level = std::numeric_limits<int>::max();

    // Collect source files while finding the shallowest deeper level.
    std::vector<std::string> files_to_compact;
    for (auto& f : files) {
      if (f.level == real_from_level)
        files_to_compact.push_back(f.name);
      else if (f.level > real_from_level && f.level < next_level)
        next_level = f.level;
    }

    if (files_to_compact.empty()) {
      fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
      return true;
    } else if (next_level == std::numeric_limits<int>::max()) {
      // There is no data beyond real_from_level. So we are done.
      fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
              real_from_level);
      return true;
    }

    fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
            from_level, static_cast<int>(files_to_compact.size()),
            real_from_level, next_level);

    ROCKSDB_NAMESPACE::CompactionOptions options;
    // Lets RocksDB use the configured compression for this level
    options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;

    // Cap output files at the column family's configured target file size.
    ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
    db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
    options.output_file_size_limit = cfDesc.options.target_file_size_base;

    Status status =
        db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
    if (!status.ok()) {
      // This can fail for valid reasons including the operation was aborted
      // or a filename is invalid because background compaction removed it.
      // Having read the current cases for which an error is raised I prefer
      // not to figure out whether an exception should be thrown here.
      fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
              status.ToString().c_str());
      return false;
    }
    return true;
  }

  void CompactLevel(int from_level) {
    if (db_.db != nullptr) {
      while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
    }
    for (auto& db_with_cfh : multi_dbs_) {
      while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
    }
  }
#endif

  // Flushes the memtable(s) of every open DB, blocking until each flush
  // completes. Exits the process on the first failure.
  void Flush() {
    FlushOptions flush_opt;
    flush_opt.wait = true;

    auto flush_one = [&flush_opt](const auto& db_with_cfh) {
      Status s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
      if (!s.ok()) {
        fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    };

    if (db_.db != nullptr) {
      flush_one(db_);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        flush_one(db_with_cfh);
      }
    }
    fprintf(stdout, "flush memtable\n");
  }

S
Siying Dong 已提交
7864 7865 7866 7867 7868 7869 7870 7871 7872
  void ResetStats() {
    if (db_.db != nullptr) {
      db_.db->ResetStats();
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->ResetStats();
    }
  }

7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905
  void PrintStatsHistory() {
    if (db_.db != nullptr) {
      PrintStatsHistoryImpl(db_.db, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStatsHistoryImpl(db_with_cfh.db, true);
    }
  }

  // Dumps the full stats history of `db` to stdout, one section per recorded
  // snapshot. `print_header` prefixes the output with the DB name (used when
  // several DBs are open).
  void PrintStatsHistoryImpl(DB* db, bool print_header) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    std::unique_ptr<StatsHistoryIterator> shi;
    Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi);
    if (!s.ok()) {
      fprintf(stdout, "%s\n", s.ToString().c_str());
      return;
    }
    assert(shi);
    for (; shi->Valid(); shi->Next()) {
      const uint64_t stats_time = shi->GetStatsTime();
      fprintf(stdout, "------ %s ------\n",
              TimeToHumanString(static_cast<int>(stats_time)).c_str());
      for (const auto& entry : shi->GetStatsMap()) {
        fprintf(stdout, " %" PRIu64 "   %s  %" PRIu64 "\n", stats_time,
                entry.first.c_str(), entry.second);
      }
    }
  }

S
Sanjay Ghemawat 已提交
7906
  void PrintStats(const char* key) {
7907 7908
    if (db_.db != nullptr) {
      PrintStats(db_.db, key, false);
7909
    }
7910 7911
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, key, true);
7912 7913 7914 7915 7916 7917 7918
    }
  }

  // Prints the value of DB property `key` for `db`, or "(failed)" when the
  // property is unknown. `print_header` prefixes the output with the DB name.
  void PrintStats(DB* db, const char* key, bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }
    std::string stats;
    if (!db->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
  }

  // Prints every DB property named in `keys` for each open DB; DB-name
  // headers are only printed in multi-DB mode.
  void PrintStats(const std::vector<std::string>& keys) {
    if (db_.db != nullptr) {
      PrintStats(db_.db, keys, /*print_header=*/false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, keys, /*print_header=*/true);
    }
  }

  // Prints "name: value" for each property in `keys` on `db`; unknown
  // properties print "(failed)". `print_header` prefixes the DB name.
  void PrintStats(DB* db, const std::vector<std::string>& keys,
                  bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    for (const auto& key : keys) {
      std::string value;
      if (!db->GetProperty(key, &value)) {
        value = "(failed)";
      }
      fprintf(stdout, "%s: %s\n", key.c_str(), value.c_str());
    }
  }

7950 7951
#ifndef ROCKSDB_LITE

7952 7953 7954 7955 7956 7957 7958 7959
  void Replay(ThreadState* thread) {
    if (db_.db != nullptr) {
      Replay(thread, &db_);
    }
  }

  // Opens --trace_file with a file-based TraceReader, builds the default
  // Replayer for `db_with_cfh`, and replays the trace with
  // --trace_replay_threads workers at --trace_replay_fast_forward speed.
  // Exits the process if the reader or replayer cannot be created.
  void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
    Status s;
    std::unique_ptr<TraceReader> trace_reader;
    s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
                           &trace_reader);
    if (!s.ok()) {
      fprintf(
          stderr,
          "Encountered an error creating a TraceReader from the trace file. "
          "Error: %s\n",
          s.ToString().c_str());
      exit(1);
    }
    std::unique_ptr<Replayer> replayer;
    s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
                                            std::move(trace_reader), &replayer);
    if (!s.ok()) {
      fprintf(stderr,
              "Encountered an error creating a default Replayer. "
              "Error: %s\n",
              s.ToString().c_str());
      exit(1);
    }
    s = replayer->Prepare();
    if (!s.ok()) {
      fprintf(stderr, "Prepare for replay failed. Error: %s\n",
              s.ToString().c_str());
    }
    s = replayer->Replay(
        ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
                      FLAGS_trace_replay_fast_forward),
        nullptr);
    replayer.reset();
    if (s.ok()) {
      fprintf(stdout, "Replay completed from trace_file: %s\n",
              FLAGS_trace_file.c_str());
    } else {
      fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
    }
  }

#endif  // ROCKSDB_LITE
};

8002
int db_bench_tool(int argc, char** argv) {
8003
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
8004
  ConfigOptions config_options;
8005 8006 8007 8008 8009 8010
  static bool initialized = false;
  if (!initialized) {
    SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                    " [OPTIONS]...");
    initialized = true;
  }
8011
  ParseCommandLineFlags(&argc, &argv, true);
8012 8013
  FLAGS_compaction_style_e =
      (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
8014 8015 8016 8017 8018 8019 8020
#ifndef ROCKSDB_LITE
  if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
    fprintf(stderr,
            "Cannot provide both --statistics and --statistics_string.\n");
    exit(1);
  }
  if (!FLAGS_statistics_string.empty()) {
8021 8022
    Status s = Statistics::CreateFromString(config_options,
                                            FLAGS_statistics_string, &dbstats);
8023
    if (dbstats == nullptr) {
8024 8025 8026
      fprintf(stderr,
              "No Statistics registered matching string: %s status=%s\n",
              FLAGS_statistics_string.c_str(), s.ToString().c_str());
8027 8028 8029 8030
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE
8031
  if (FLAGS_statistics) {
8032
    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
J
jorlow@chromium.org 已提交
8033
  }
S
Siying Dong 已提交
8034
  if (dbstats) {
8035
    dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
S
Siying Dong 已提交
8036
  }
8037 8038
  FLAGS_compaction_pri_e =
      (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
J
jorlow@chromium.org 已提交
8039

8040
  std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
I
Igor Canadi 已提交
8041
      FLAGS_max_bytes_for_level_multiplier_additional, ',');
8042
  for (size_t j = 0; j < fanout.size(); j++) {
8043
    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
S
sdong 已提交
8044 8045 8046 8047 8048
#ifndef CYGWIN
        std::stoi(fanout[j]));
#else
        stoi(fanout[j]));
#endif
8049 8050 8051 8052 8053
  }

  FLAGS_compression_type_e =
    StringToCompressionType(FLAGS_compression_type.c_str());

8054 8055 8056
  FLAGS_wal_compression_e =
      StringToCompressionType(FLAGS_wal_compression.c_str());

8057 8058 8059
  FLAGS_lru_secondary_cache_compression_type_e = StringToCompressionType(
      FLAGS_lru_secondary_cache_compression_type.c_str());

8060
#ifndef ROCKSDB_LITE
8061
  // Stacked BlobDB
8062 8063 8064
  FLAGS_blob_db_compression_type_e =
    StringToCompressionType(FLAGS_blob_db_compression_type.c_str());

8065
  int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
8066
  if (env_opts > 1) {
8067
    fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
8068
    exit(1);
8069 8070
  }

8071
  if (env_opts == 1) {
8072
    Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
8073 8074 8075
                                  &FLAGS_env, &env_guard);
    if (!s.ok()) {
      fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
8076 8077
      exit(1);
    }
S
sdong 已提交
8078
  } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
8079 8080 8081 8082
    //**TODO: Make the simulate fs something that can be loaded
    // from the ObjectRegistry...
    static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
        NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
S
sdong 已提交
8083 8084 8085 8086
            FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
            /*throughput_multiplier=*/
            int{FLAGS_simulate_hybrid_hdd_multipliers},
            /*is_full_fs_warm=*/FLAGS_simulate_hdd));
8087
    FLAGS_env = composite_env.get();
8088
  }
8089 8090 8091

  // Let -readonly imply -use_existing_db
  FLAGS_use_existing_db |= FLAGS_readonly;
8092
#endif  // ROCKSDB_LITE

  if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
    fprintf(stderr,
            "`-use_existing_db` must be true for `-use_existing_keys` to be "
            "settable\n");
    exit(1);
  }

  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
  else {
    fprintf(stdout, "Unknown compaction fadvice:%s\n",
            FLAGS_compaction_fadvice.c_str());
  }

  FLAGS_value_size_distribution_type_e =
    StringToDistributionType(FLAGS_value_size_distribution_type.c_str());

  // Note options sanitization may increase thread pool sizes according to
  // max_background_flushes/max_background_compactions/max_background_jobs
  FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::HIGH);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::LOW);

  // Choose a location for the test database if none given with --db=<path>
  if (FLAGS_db.empty()) {
    std::string default_db_path;
    FLAGS_env->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path;
  }

  if (FLAGS_stats_interval_seconds > 0) {
    // When both are set then FLAGS_stats_interval determines the frequency
    // at which the timer is checked for FLAGS_stats_interval_seconds
    FLAGS_stats_interval = 1000;
  }

  if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
    fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
    exit(1);
  }

  ROCKSDB_NAMESPACE::Benchmark benchmark;
  benchmark.Run();

#ifndef ROCKSDB_LITE
  if (FLAGS_print_malloc_stats) {
    std::string stats_string;
    ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
    fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
  }
#endif  // ROCKSDB_LITE

  return 0;
}
}  // namespace ROCKSDB_NAMESPACE
#endif