db_bench_tool.cc
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifdef GFLAGS
#ifdef NUMA
#include <numa.h>
#include <numaif.h>
#endif
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <fcntl.h>
#include <cinttypes>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

#include "db/db_impl/db_impl.h"
#include "db/malloc_stats.h"
#include "db/version_set.h"
#include "hdfs/env_hdfs.h"
#include "monitoring/histogram.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/stats_history.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/options_util.h"
#include "rocksdb/utilities/sim_cache.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/write_batch.h"
#include "test_util/testutil.h"
#include "test_util/transaction_test_util.h"
#include "util/cast_util.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/gflags_compat.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/stderr_logger.h"
#include "util/string_util.h"
#include "util/xxhash.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/merge_operators.h"
#include "utilities/merge_operators/bytesxor.h"
#include "utilities/merge_operators/sortlist.h"
#include "utilities/persistent_cache/block_cache_tier.h"

#ifdef OS_WIN
#include <io.h>  // open/close
#endif

using GFLAGS_NAMESPACE::ParseCommandLineFlags;
using GFLAGS_NAMESPACE::RegisterFlagValidator;
using GFLAGS_NAMESPACE::SetUsageMessage;

DEFINE_string(
    benchmarks,
    "fillseq,"
    "fillseqdeterministic,"
    "fillsync,"
    "fillrandom,"
    "filluniquerandomdeterministic,"
    "overwrite,"
    "readrandom,"
    "newiterator,"
    "newiteratorwhilewriting,"
    "seekrandom,"
    "seekrandomwhilewriting,"
    "seekrandomwhilemerging,"
    "readseq,"
    "readreverse,"
    "compact,"
    "compactall,"
    "multireadrandom,"
    "mixgraph,"
    "readseq,"
    "readtocache,"
    "readreverse,"
    "readwhilewriting,"
    "readwhilemerging,"
    "readwhilescanning,"
    "readrandomwriterandom,"
    "updaterandom,"
    "xorupdaterandom,"
    "randomwithverify,"
    "fill100K,"
    "crc32c,"
    "xxhash,"
    "compress,"
    "uncompress,"
    "acquireload,"
    "fillseekseq,"
    "randomtransaction,"
    "randomreplacekeys,"
    "timeseries,"
    "getmergeoperands",

    "Comma-separated list of operations to run in the specified"
    " order. Available benchmarks:\n"
    "\tfillseq       -- write N values in sequential key"
    " order in async mode\n"
    "\tfillseqdeterministic       -- write N values in the specified"
    " key order and keep the shape of the LSM tree\n"
    "\tfillrandom    -- write N values in random key order in async"
    " mode\n"
    "\tfilluniquerandomdeterministic       -- write N values in a random"
    " key order and keep the shape of the LSM tree\n"
    "\toverwrite     -- overwrite N values in random key order in"
    " async mode\n"
    "\tfillsync      -- write N/100 values in random key order in "
    "sync mode\n"
    "\tfill100K      -- write N/1000 100K values in random order in"
    " async mode\n"
    "\tdeleteseq     -- delete N keys in sequential order\n"
    "\tdeleterandom  -- delete N keys in random order\n"
    "\treadseq       -- read N times sequentially\n"
    "\treadtocache   -- 1 thread reading database sequentially\n"
    "\treadreverse   -- read N times in reverse order\n"
    "\treadrandom    -- read N times in random order\n"
    "\treadmissing   -- read N missing keys in random order\n"
    "\treadwhilewriting      -- 1 writer, N threads doing random "
    "reads\n"
    "\treadwhilemerging      -- 1 merger, N threads doing random "
    "reads\n"
    "\treadwhilescanning     -- 1 thread doing full table scan, "
    "N threads doing random reads\n"
    "\treadrandomwriterandom -- N threads doing random-read, "
    "random-write\n"
    "\tupdaterandom  -- N threads doing read-modify-write for random "
    "keys\n"
    "\txorupdaterandom  -- N threads doing read-XOR-write for "
    "random keys\n"
    "\tappendrandom  -- N threads doing read-modify-write with "
    "growing values\n"
    "\tmergerandom   -- same as updaterandom/appendrandom using merge"
    " operator. "
    "Must be used with merge_operator\n"
    "\treadrandommergerandom -- perform N random read-or-merge "
    "operations. Must be used with merge_operator\n"
    "\tnewiterator   -- repeated iterator creation\n"
    "\tseekrandom    -- N random seeks, call Next seek_nexts times "
    "per seek\n"
    "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
    "overwrite\n"
    "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
    "merge\n"
    "\tcrc32c        -- repeated crc32c of 4K of data\n"
    "\txxhash        -- repeated xxHash of 4K of data\n"
    "\tacquireload   -- load N*1000 times\n"
    "\tfillseekseq   -- write N values in sequential key, then read "
    "them by seeking to each key\n"
    "\trandomtransaction     -- execute N random transactions and "
    "verify correctness\n"
    "\trandomreplacekeys     -- randomly replaces N keys by deleting "
    "the old version and putting the new version\n\n"
    "\ttimeseries            -- 1 writer generates time series data "
    "and multiple readers doing random reads on id\n\n"
    "Meta operations:\n"
    "\tcompact     -- Compact the entire DB; If multiple, randomly choose one\n"
    "\tcompactall  -- Compact the entire DB\n"
    "\tstats       -- Print DB stats\n"
    "\tresetstats  -- Reset DB stats\n"
    "\tlevelstats  -- Print the number of files and bytes per level\n"
    "\tsstables    -- Print sstable info\n"
    "\theapprofile -- Dump a heap profile (if supported by this port)\n"
    "\treplay      -- replay the trace file specified with trace_file\n"
    "\tgetmergeoperands -- Insert lots of merge records which are a list of "
    "sorted ints for a key and then compare performance of lookup for another "
    "key "
    "by doing a Get followed by binary searching in the large sorted list vs "
    "doing a GetMergeOperands and binary searching in the operands which are "
    "sorted sub-lists. The MergeOperator used is sortlist.h\n");
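
// Example invocation (illustrative; the flag values are arbitrary):
//   db_bench --benchmarks=fillseq,readrandom --num=1000000 --threads=4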

DEFINE_int64(num, 1000000, "Number of key/values to place in database");

DEFINE_int64(numdistinct, 1000,
             "Number of distinct keys to use. Used in RandomWithVerify to "
             "read/write on fewer keys so that gets are more likely to find the"
             " key and puts are more likely to update the same key");

DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");

DEFINE_int32(
    num_hot_column_families, 0,
    "Number of Hot Column Families. If more than 0, only write to this "
    "number of column families. After finishing all the writes to them, "
    "create new set of column families and insert to them. Only used "
    "when num_column_families > 1.");

DEFINE_string(column_family_distribution, "",
              "Comma-separated list of percentages, where the ith element "
              "indicates the probability of an op using the ith column family. "
              "The number of elements must be `num_hot_column_families` if "
              "specified; otherwise, it must be `num_column_families`. The "
              "sum of elements must be 100. E.g., if `num_column_families=4`, "
              "and `num_hot_column_families=0`, a valid list could be "
              "\"10,20,30,40\".");

DEFINE_int64(reads, -1, "Number of read operations to do.  "
             "If negative, do FLAGS_num reads.");

DEFINE_int64(deletes, -1, "Number of delete operations to do.  "
             "If negative, do FLAGS_num deletions.");

DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");

DEFINE_int64(seed, 0, "Seed base for random number generators. "
             "When 0 it is deterministic.");

DEFINE_int32(threads, 1, "Number of concurrent threads to run.");

DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
             " When 0 then num & reads determine the test duration");

DEFINE_int32(value_size, 100, "Size of each value");

DEFINE_int32(seek_nexts, 0,
             "How many times to call Next() after Seek() in "
             "fillseekseq, seekrandom, seekrandomwhilewriting and "
             "seekrandomwhilemerging");

DEFINE_bool(reverse_iterator, false,
            "When true use Prev rather than Next for iterators that do "
            "Seek and then Next");

DEFINE_int64(max_scan_distance, 0,
             "Used to define iterate_upper_bound (or iterate_lower_bound "
             "if FLAGS_reverse_iterator is set to true) when value is nonzero");

DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");

DEFINE_int64(batch_size, 1, "Batch size");

static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}

static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
            (unsigned long)value);
    return false;
  }
  return true;
}

DEFINE_int32(key_size, 16, "size of each key");

DEFINE_int32(num_multi_db, 0,
             "Number of DBs used in the benchmark. 0 means single DB.");

DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
              " to this fraction of their original size after compression");

DEFINE_double(read_random_exp_range, 0.0,
              "Read random's key will be generated using distribution of "
              "num * exp(-r) where r is a uniform number from 0 to this value. "
              "The larger the number is, the more skewed the reads are. "
              "Only used in readrandom and multireadrandom benchmarks.");

DEFINE_bool(histogram, false, "Print histogram of operation timings");

DEFINE_bool(enable_numa, false,
            "Make operations aware of NUMA architecture and bind memory "
            "and cpus corresponding to nodes together. In NUMA, memory "
            "in the same node as the CPUs is closer than memory in "
            "other nodes. Reads can be faster when the process is bound to "
            "CPU and memory of the same node. Use \"$numactl --hardware\" command "
            "to see NUMA memory architecture.");

DEFINE_int64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size,
             "Number of bytes to buffer in all memtables before compacting");

DEFINE_bool(cost_write_buffer_to_cache, false,
            "The usage of memtable is charged to the block cache");

DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
             "Number of bytes to buffer in memtable before compacting");

DEFINE_int32(max_write_buffer_number,
             rocksdb::Options().max_write_buffer_number,
             "The number of in-memory memtables. Each memtable is of size"
             " write_buffer_size bytes.");

DEFINE_int32(min_write_buffer_number_to_merge,
             rocksdb::Options().min_write_buffer_number_to_merge,
             "The minimum number of write buffers that will be merged together "
             "before writing to storage. This is cheap because it is an "
             "in-memory merge. If this feature is not enabled, then all these "
             "write buffers are flushed to L0 as separate files and this "
             "increases read amplification because a get request has to check"
             " in all of these files. Also, an in-memory merge may result in"
             " writing less data to storage if there are duplicate records "
             "in each of these individual write buffers.");

DEFINE_int32(max_write_buffer_number_to_maintain,
             rocksdb::Options().max_write_buffer_number_to_maintain,
             "The total maximum number of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

DEFINE_int64(max_write_buffer_size_to_maintain,
             rocksdb::Options().max_write_buffer_size_to_maintain,
             "The total maximum size of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

DEFINE_int32(max_background_jobs,
             rocksdb::Options().max_background_jobs,
             "The maximum number of concurrent background jobs that can occur "
             "in parallel.");

DEFINE_int32(num_bottom_pri_threads, 0,
             "The number of threads in the bottom-priority thread pool (used "
             "by universal compaction only).");

DEFINE_int32(num_high_pri_threads, 0,
             "The maximum number of threads in the high-priority thread pool "
             "(used mainly for flushes).");

DEFINE_int32(num_low_pri_threads, 0,
             "The maximum number of threads in the low-priority thread pool "
             "(used mainly for compactions).");

DEFINE_int32(max_background_compactions,
             rocksdb::Options().max_background_compactions,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

DEFINE_int32(base_background_compactions, -1, "DEPRECATED");

DEFINE_uint64(subcompactions, 1,
              "Maximum number of subcompactions to divide L0-L1 compactions "
              "into.");
static const bool FLAGS_subcompactions_dummy
    __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions,
                                                        &ValidateUint32Range);

DEFINE_int32(max_background_flushes,
             rocksdb::Options().max_background_flushes,
             "The maximum number of concurrent background flushes"
             " that can occur in parallel.");

static rocksdb::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
             "style of compaction: level-based, universal and fifo");

static rocksdb::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri, (int32_t)rocksdb::Options().compaction_pri,
             "priority of files to compaction: by size or by data age");

DEFINE_int32(universal_size_ratio, 0,
             "Percentage flexibility while comparing file size"
             " (for universal compaction only).");

DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
             " single compaction run (for universal compaction only).");

DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
             " in universal style compaction");

DEFINE_int32(universal_max_size_amplification_percent, 0,
             "The max size amplification for universal style compaction");

DEFINE_int32(universal_compression_size_percent, -1,
             "The percentage of the database to compress for universal "
             "compaction. -1 means compress everything.");

DEFINE_bool(universal_allow_trivial_move, false,
            "Allow trivial move in universal compaction.");

DEFINE_int64(cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of uncompressed data");

DEFINE_int32(cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** cache_numshardbits. Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");

DEFINE_double(cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

DEFINE_bool(use_clock_cache, false,
            "Replace default LRU block cache with clock cache.");

DEFINE_int64(simcache_size, -1,
             "Number of bytes to use as a simcache of "
             "uncompressed data. Negative value disables simcache.");

DEFINE_bool(cache_index_and_filter_blocks, false,
            "Cache index/filter blocks in block cache.");

DEFINE_bool(partition_index_and_filters, false,
            "Partition index and filter blocks.");

DEFINE_bool(partition_index, false, "Partition index blocks");

DEFINE_int64(metadata_block_size,
             rocksdb::BlockBasedTableOptions().metadata_block_size,
             "Max partition size when partitioning index/filters");

// The default reduces the overhead of reading time with flash. With HDD, which
// offers much less throughput, however, this number better to be set to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
             "Check duration limit every x ops");

DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
            "Pin index/filter blocks of L0 files in block cache.");

DEFINE_bool(
    pin_top_level_index_and_filter, false,
    "Pin top-level index of partitioned index/filter blocks in block cache.");

DEFINE_int32(block_size,
             static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
             "Number of bytes in a block.");

DEFINE_int32(
    format_version,
    static_cast<int32_t>(rocksdb::BlockBasedTableOptions().format_version),
    "Format version of SST files.");

DEFINE_int32(block_restart_interval,
             rocksdb::BlockBasedTableOptions().block_restart_interval,
             "Number of keys between restart points "
             "for delta encoding of keys in data block.");

DEFINE_int32(index_block_restart_interval,
             rocksdb::BlockBasedTableOptions().index_block_restart_interval,
             "Number of keys between restart points "
             "for delta encoding of keys in index block.");

DEFINE_int32(read_amp_bytes_per_bit,
             rocksdb::BlockBasedTableOptions().read_amp_bytes_per_bit,
             "Number of bytes per bit to be used in block read-amp bitmap");

DEFINE_bool(enable_index_compression,
            rocksdb::BlockBasedTableOptions().enable_index_compression,
            "Compress the index block");

DEFINE_bool(block_align, rocksdb::BlockBasedTableOptions().block_align,
            "Align data blocks on page size");

DEFINE_bool(use_data_block_hash_index, false,
            "if use kDataBlockBinaryAndHash "
            "instead of kDataBlockBinarySearch. "
            "This is only valid if we use BlockTable");

DEFINE_double(data_block_hash_table_util_ratio, 0.75,
              "util ratio for data block hash index table. "
              "This is only valid if use_data_block_hash_index is "
              "set to true");

DEFINE_int64(compressed_cache_size, -1,
             "Number of bytes to use as a cache of compressed data.");

DEFINE_int64(row_cache_size, 0,
             "Number of bytes to use as a cache of individual rows"
             " (0 = disabled).");

DEFINE_int32(open_files, rocksdb::Options().max_open_files,
             "Maximum number of files to keep open at the same time"
             " (use default if == 0)");

DEFINE_int32(file_opening_threads, rocksdb::Options().max_file_opening_threads,
             "If open_files is set to -1, this option sets the number of "
             "threads that will be used to open files during DB::Open()");

DEFINE_bool(new_table_reader_for_compaction_inputs, true,
             "If true, uses a separate file handle for compaction inputs");

DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");

DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
             "Maximum windows randomaccess buffer size");

DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
             "Maximum write buffer for Writable File");

DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
             " use default settings.");
DEFINE_double(memtable_bloom_size_ratio, 0,
              "Ratio of memtable size used for bloom filter. 0 means no bloom "
              "filter.");
DEFINE_bool(memtable_whole_key_filtering, false,
            "Try to use whole key bloom filter in memtables.");
DEFINE_bool(memtable_use_huge_page, false,
            "Try to use huge page in memtables.");

DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
            " database.  If you set this flag and also specify a benchmark that"
            " wants a fresh database, that benchmark will fail.");

DEFINE_bool(use_existing_keys, false,
            "If true, uses existing keys in the DB, "
            "rather than generating new ones. This involves some startup "
            "latency to load all keys into memory. It is supported for the "
            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
            "must also be set for this flag to be enabled. When this flag is "
            "set, the value for `-num` will be ignored.");

DEFINE_bool(show_table_properties, false,
            "If true, then per-level table"
            " properties will be printed on every stats-interval when"
            " stats_interval is set and stats_per_interval is on.");

DEFINE_string(db, "", "Use the db with the following name.");

// Read cache flags

DEFINE_string(read_cache_path, "",
              "If not empty string, a read cache will be used in this path");

DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
             "Maximum size of the read cache");

DEFINE_bool(read_cache_direct_write, true,
            "Whether to use Direct IO for writing to the read cache");

DEFINE_bool(read_cache_direct_read, true,
            "Whether to use Direct IO for reading from read cache");

DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");

static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n",
            flagname, value);
    return false;
  }
  return true;
}

DEFINE_bool(verify_checksum, true,
            "Verify checksum for every block read"
            " from storage");

DEFINE_bool(statistics, false, "Database statistics");
DEFINE_int32(stats_level, rocksdb::StatsLevel::kExceptDetailedTimers,
             "stats level for statistics");
DEFINE_string(statistics_string, "", "Serialized statistics string");
static class std::shared_ptr<rocksdb::Statistics> dbstats;

DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
             " --num writes.");

DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished");

DEFINE_bool(sync, false, "Sync all writes to disk");

DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");

DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");

DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");

DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
              "Truth key/values used when using verify");

DEFINE_int32(num_levels, 7, "The total number of levels");

DEFINE_int64(target_file_size_base, rocksdb::Options().target_file_size_base,
             "Target file size at level-1");

DEFINE_int32(target_file_size_multiplier,
             rocksdb::Options().target_file_size_multiplier,
             "A multiplier to compute target level-N file size (N >= 2)");

DEFINE_uint64(max_bytes_for_level_base,
              rocksdb::Options().max_bytes_for_level_base,
              "Max bytes for level-1");

DEFINE_bool(level_compaction_dynamic_level_bytes, false,
            "Whether level size base is dynamic");

DEFINE_double(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");

static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
              "A vector that specifies additional fanout per level");

DEFINE_int32(level0_stop_writes_trigger,
             rocksdb::Options().level0_stop_writes_trigger,
             "Number of files in level-0"
             " that will trigger put stop.");

DEFINE_int32(level0_slowdown_writes_trigger,
             rocksdb::Options().level0_slowdown_writes_trigger,
             "Number of files in level-0"
             " that will slow down writes.");

DEFINE_int32(level0_file_num_compaction_trigger,
             rocksdb::Options().level0_file_num_compaction_trigger,
             "Number of files in level-0"
             " when compactions start");

static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value <= 0 || value>=100) {
    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
             " as percentage) for the ReadRandomWriteRandom workload. The "
             "default value 90 means 90% operations out of all reads and writes"
             " operations are reads. In other words, 9 gets for every 1 put.");

DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
             " as percentage) for the ReadRandomMergeRandom workload. The"
             " default value 70 means 70% out of all read and merge operations"
             " are merges. In other words, 7 merges for every 3 gets.");

DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
             "deletes (used in RandomWithVerify only). RandomWithVerify "
             "calculates writepercent as (100 - FLAGS_readwritepercent - "
             "deletepercent), so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");

DEFINE_bool(optimize_filters_for_hits, false,
            "Optimizes bloom filters for workloads where most lookups return "
            "a value. For now this doesn't create bloom filters for the max "
            "level of the LSM to reduce metadata that should fit in RAM. ");

DEFINE_uint64(delete_obsolete_files_period_micros, 0,
              "Ignored. Left here for backward compatibility");

DEFINE_int64(writes_before_delete_range, 0,
             "Number of writes before DeleteRange is called regularly.");

DEFINE_int64(writes_per_range_tombstone, 0,
             "Number of writes between range tombstones");

DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");

DEFINE_int64(max_num_range_tombstones, 0,
             "Maximum number of range tombstones "
             "to insert.");

DEFINE_bool(expand_range_tombstones, false,
            "Expand range tombstone into sequential regular tombstones.");

#ifndef ROCKSDB_LITE
// Transactions Options
DEFINE_bool(optimistic_transaction_db, false,
            "Open an OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (use in RandomTransaction only).  Max: 9999");

DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only). ");

DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");
DEFINE_string(
    options_file, "",
    "The path to a RocksDB options file.  If specified, then db_bench will "
    "run with the RocksDB options in the default column family of the "
    "specified options file. "
    "Note that with this setting, db_bench will ONLY accept the following "
    "RocksDB options related command-line arguments, all other arguments "
    "that are related to RocksDB options will be ignored:\n"
    "\t--use_existing_db\n"
    "\t--use_existing_keys\n"
    "\t--statistics\n"
    "\t--row_cache_size\n"
    "\t--row_cache_numshardbits\n"
    "\t--enable_io_prio\n"
    "\t--dump_malloc_stats\n"
    "\t--num_multi_db\n");
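
// Illustrative example; the OPTIONS path below is a placeholder:
//   db_bench --options_file=/path/to/OPTIONS --use_existing_db=true \
//            --benchmarks=readrandom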

// FIFO Compaction Options
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");

DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");

DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");

// Blob DB Options
DEFINE_bool(use_blob_db, false,
            "Open a BlobDB instance. "
            "Required for large value benchmark.");

DEFINE_bool(blob_db_enable_gc, false, "Enable BlobDB garbage collection.");

DEFINE_bool(blob_db_is_fifo, false, "Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_max_db_size, 0,
              "Max size limit of the directory where blob files are stored.");

DEFINE_uint64(blob_db_max_ttl_range, 86400,
              "TTL range to generate BlobDB data (in seconds).");

DEFINE_uint64(blob_db_ttl_range_secs, 3600,
              "TTL bucket size to use when creating blob files.");

DEFINE_uint64(blob_db_min_blob_size, 0,
              "Smallest blob to store in a file. Blobs smaller than this "
              "will be inlined with the key in the LSM tree.");

DEFINE_uint64(blob_db_bytes_per_sync, 0, "Bytes to sync blob file at.");

DEFINE_uint64(blob_db_file_size, 256 * 1024 * 1024,
              "Target size of each blob file.");

// Secondary DB instance Options
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");

#endif  // ROCKSDB_LITE

DEFINE_bool(report_bg_io_stats, false,
            "Measure time spent on I/Os while in compactions.");

DEFINE_bool(use_stderr_info_logger, false,
            "Write info logs to stderr instead of to LOG file. ");

DEFINE_string(trace_file, "", "Trace workload to a file. ");

DEFINE_int32(trace_replay_fast_forward, 1,
             "Fast forward trace replay, must be >= 1. ");

DEFINE_int32(block_cache_trace_sampling_frequency, 1,
             "Block cache trace sampling frequency, termed s. It uses spatial "
             "downsampling and samples accesses to one out of s blocks.");
DEFINE_int64(
    block_cache_trace_max_trace_file_size_in_bytes,
    uint64_t{64} * 1024 * 1024 * 1024,
    "The maximum block cache trace file size in bytes. Block cache accesses "
    "will not be logged if the trace file size exceeds this threshold. Default "
    "is 64 GB.");
DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");

static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "none"))
    return rocksdb::kNoCompression;
  else if (!strcasecmp(ctype, "snappy"))
    return rocksdb::kSnappyCompression;
  else if (!strcasecmp(ctype, "zlib"))
    return rocksdb::kZlibCompression;
  else if (!strcasecmp(ctype, "bzip2"))
    return rocksdb::kBZip2Compression;
  else if (!strcasecmp(ctype, "lz4"))
    return rocksdb::kLZ4Compression;
  else if (!strcasecmp(ctype, "lz4hc"))
    return rocksdb::kLZ4HCCompression;
  else if (!strcasecmp(ctype, "xpress"))
    return rocksdb::kXpressCompression;
  else if (!strcasecmp(ctype, "zstd"))
    return rocksdb::kZSTD;

  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
  return rocksdb::kSnappyCompression;  // default value
}

static std::string ColumnFamilyName(size_t i) {
  if (i == 0) {
    return rocksdb::kDefaultColumnFamilyName;
  } else {
    char name[100];
    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
    return std::string(name);
  }
}
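// e.g. ColumnFamilyName(2) returns "column_family_name_000002".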

DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
static enum rocksdb::CompressionType FLAGS_compression_type_e =
    rocksdb::kSnappyCompression;

DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");

DEFINE_int32(compression_level, rocksdb::CompressionOptions().level,
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");

DEFINE_int32(compression_max_dict_bytes,
             rocksdb::CompressionOptions().max_dict_bytes,
             "Maximum size of dictionary used to prime the compression "
             "library.");

DEFINE_int32(compression_zstd_max_train_bytes,
             rocksdb::CompressionOptions().zstd_max_train_bytes,
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");

static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  if (0 >= value || value > 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be  0 < val <= 20\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(table_cache_numshardbits, 4, "");

#ifndef ROCKSDB_LITE
DEFINE_string(env_uri, "", "URI for registry Env lookup. Mutually exclusive"
              " with --hdfs.");
#endif  // ROCKSDB_LITE
DEFINE_string(hdfs, "", "Name of hdfs environment. Mutually exclusive with"
              " --env_uri.");
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();

DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
             "this is greater than zero. When 0 the interval grows over time.");

DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
             "overrides stats_interval when both are > 0.");

DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
             " this is greater than 0.");

DEFINE_int64(report_interval_seconds, 0,
             "If greater than zero, it will write simple stats in CSV format "
             "to --report_file every N seconds");

DEFINE_string(report_file, "report.csv",
              "Filename where some simple stats are reported to (if "
              "--report_interval_seconds is bigger than 0)");

DEFINE_int32(thread_status_per_interval, 0,
             "Take and report a snapshot of the current status of each thread"
             " when this is greater than 0.");

DEFINE_int32(perf_level, rocksdb::PerfLevel::kDisable, "Level of perf collection");

static bool ValidateRateLimit(const char* flagname, double value) {
  const double EPSILON = 1e-10;
  if ( value < -EPSILON ) {
    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");

DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");

DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
              "Stop writes if pending compaction bytes exceed this number");

DEFINE_uint64(delayed_write_rate, 8388608u,
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");

DEFINE_bool(unordered_write, false,
            "Enable the unordered write feature, which trades the immutability "
            "guarantee of snapshots for higher write throughput");

DEFINE_bool(allow_concurrent_memtable_write, true,
            "Allow multi-writers to update mem tables in parallel.");

DEFINE_bool(inplace_update_support, rocksdb::Options().inplace_update_support,
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
              rocksdb::Options().inplace_update_num_locks,
              "Number of RW locks to protect in-place memtable updates");

DEFINE_bool(enable_write_thread_adaptive_yield, true,
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
             "When hard_rate_limit is set then this is the max time a put will"
             " be stalled.");

DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");


DEFINE_bool(sine_write_rate, false,
            "Use a sine wave write_rate_limit");

DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
              "Interval of which the sine wave write_rate_limit is recalculated");

DEFINE_double(sine_a, 1,
             "A in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_b, 1,
             "B in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_c, 0,
             "C in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_d, 1,
             "D in f(x) = A sin(bx + c) + d");

DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

DEFINE_uint64(
    benchmark_write_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
    "is the global rate in bytes/second.");

// The parameters of the mix_graph workload
DEFINE_double(key_dist_a, 0.0,
              "The parameter 'a' of key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(key_dist_b, 0.0,
              "The parameter 'b' of key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(value_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_k, 0.0,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_sigma, 0.0,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_k, 0.0,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_sigma, 0.0,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(mix_get_ratio, 1.0,
              "The ratio of Get queries of mix_graph workload");
DEFINE_double(mix_put_ratio, 0.0,
              "The ratio of Put queries of mix_graph workload");
DEFINE_double(mix_seek_ratio, 0.0,
              "The ratio of Seek queries of mix_graph workload");
DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
DEFINE_int64(mix_ave_kv_size, 512,
             "The average key-value size of this workload");
DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
DEFINE_double(
    sine_mix_rate_noise, 0.0,
    "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
DEFINE_bool(sine_mix_rate, false,
            "Enable the sine QPS control on the mix workload");
DEFINE_uint64(
    sine_mix_rate_interval_milliseconds, 10000,
    "Interval of which the sine wave read_rate_limit is recalculated");
DEFINE_int64(mix_accesses, -1,
             "The total query accesses of mix_graph workload");

DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

DEFINE_uint64(max_compaction_bytes, rocksdb::Options().max_compaction_bytes,
              "Max bytes allowed in one compaction");

#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
#endif  // ROCKSDB_LITE

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
              " in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");

DEFINE_bool(mmap_read, rocksdb::Options().allow_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, rocksdb::Options().allow_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(use_direct_reads, rocksdb::Options().use_direct_reads,
            "Use O_DIRECT for reading data");

DEFINE_bool(use_direct_io_for_flush_and_compaction,
            rocksdb::Options().use_direct_io_for_flush_and_compaction,
            "Use O_DIRECT for background flush and compaction writes");

DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
            "Advise random access on table file open");

DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
static auto FLAGS_compaction_fadvice_e =
  rocksdb::Options().access_hint_on_compaction_start;

DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
            "Use adaptive mutex");

DEFINE_uint64(bytes_per_sync,  rocksdb::Options().bytes_per_sync,
              "Allows OS to incrementally sync SST files to disk while they are"
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");

DEFINE_uint64(wal_bytes_per_sync,  rocksdb::Options().wal_bytes_per_sync,
              "Allows OS to incrementally sync WAL files to disk while they are"
              " being written, in the background. Issue one request for every"
              " wal_bytes_per_sync written. 0 turns it off.");

DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");

DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the options "
              "below: none (do not expire data), compaction_filter (use a "
              "compaction filter to remove expired data), delete (seek IDs and "
              "remove expired data) (used in TimeSeries only).");

DEFINE_uint64(
    time_range, 100000,
    "Range of timestamps stored in the database (used in TimeSeries"
    " only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
             " operations on a key in the memtable");

static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value < 0 || value>=2000000000) {
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}

DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
             "plain table");
DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
             "per prefix, 0 means no special handling of the prefix, "
             "i.e. use the prefix that comes with the generated random number.");
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");

DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
            "threads' IO priority");
DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction "
            "threads' CPU priority");
DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
            "table becomes an identity function. This is only valid when key "
            "is 8 bytes");
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
DEFINE_uint64(stats_dump_period_sec, rocksdb::Options().stats_dump_period_sec,
              "Gap between printing stats to log in seconds");
DEFINE_uint64(stats_persist_period_sec,
              rocksdb::Options().stats_persist_period_sec,
              "Gap between persisting stats in seconds");
DEFINE_bool(persist_stats_to_disk, rocksdb::Options().persist_stats_to_disk,
            "whether to persist stats to disk");
DEFINE_uint64(stats_history_buffer_size,
              rocksdb::Options().stats_history_buffer_size,
              "Max number of stats snapshots to keep in memory");
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");

enum RepFactory {
  kSkipList,
  kPrefixHash,
  kVectorRep,
  kHashLinkedList,
};

static enum RepFactory StringToRepFactory(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "skip_list"))
    return kSkipList;
  else if (!strcasecmp(ctype, "prefix_hash"))
    return kPrefixHash;
  else if (!strcasecmp(ctype, "vector"))
    return kVectorRep;
  else if (!strcasecmp(ctype, "hash_linkedlist"))
    return kHashLinkedList;

  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
  return kSkipList;
}

static enum RepFactory FLAGS_rep_factory;
DEFINE_string(memtablerep, "skip_list", "");
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
DEFINE_bool(use_plain_table, false, "if use plain table "
            "instead of block-based table format");
DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false, "if use kHashSearch "
            "instead of kBinarySearch. "
            "This is only valid if we use BlockTable");
DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
            "instead of kFullFilter for filter block. "
            "This is only valid if we use BlockTable");
1217 1218 1219 1220
DEFINE_string(merge_operator, "", "The merge operator to use with the database."
              "If a new merge operator is specified, be sure to use fresh"
              " database The possible merge operators are defined in"
              " utilities/merge_operators.h");
T
Tomislav Novak 已提交
1221 1222 1223
DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
             "linear search first for this many steps from the previous "
             "position");
1224 1225
DEFINE_bool(report_file_operations, false, "if report number of file "
            "operations");
1226
DEFINE_int32(readahead_size, 0, "Iterator readahead size");
D
Deon Nicholas 已提交
1227

static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);

static const bool FLAGS_hard_rate_limit_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);

static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_cache_numshardbits,
                          &ValidateCacheNumshardbits);

static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);

DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");

static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
                          &ValidateTableCacheNumshardbits);

namespace rocksdb {

namespace {
struct ReportFileOpCounters {
  std::atomic<int> open_counter_;
  std::atomic<int> read_counter_;
  std::atomic<int> append_counter_;
  std::atomic<uint64_t> bytes_read_;
  std::atomic<uint64_t> bytes_written_;
};

// A special Env to records and report file operations in db_bench
class ReportFileOpEnv : public EnvWrapper {
 public:
  explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }

  void reset() {
    counters_.open_counter_ = 0;
    counters_.read_counter_ = 0;
    counters_.append_counter_ = 0;
    counters_.bytes_read_ = 0;
    counters_.bytes_written_ = 0;
  }

  Status NewSequentialFile(const std::string& f,
                           std::unique_ptr<SequentialFile>* r,
                           const EnvOptions& soptions) override {
    class CountingFile : public SequentialFile {
     private:
      std::unique_ptr<SequentialFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<SequentialFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}

      Status Read(size_t n, Slice* result, char* scratch) override {
        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Read(n, result, scratch);
        counters_->bytes_read_.fetch_add(result->size(),
                                         std::memory_order_relaxed);
        return rv;
      }

      Status Skip(uint64_t n) override { return target_->Skip(n); }
    };

    Status s = target()->NewSequentialFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  Status NewRandomAccessFile(const std::string& f,
                             std::unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& soptions) override {
    class CountingFile : public RandomAccessFile {
     private:
      std::unique_ptr<RandomAccessFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<RandomAccessFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}
      Status Read(uint64_t offset, size_t n, Slice* result,
                  char* scratch) const override {
        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Read(offset, n, result, scratch);
        counters_->bytes_read_.fetch_add(result->size(),
                                         std::memory_order_relaxed);
        return rv;
      }
    };

    Status s = target()->NewRandomAccessFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
                         const EnvOptions& soptions) override {
    class CountingFile : public WritableFile {
     private:
      std::unique_ptr<WritableFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<WritableFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}

      Status Append(const Slice& data) override {
        counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Append(data);
        counters_->bytes_written_.fetch_add(data.size(),
                                            std::memory_order_relaxed);
        return rv;
      }

      Status Truncate(uint64_t size) override { return target_->Truncate(size); }
      Status Close() override { return target_->Close(); }
      Status Flush() override { return target_->Flush(); }
      Status Sync() override { return target_->Sync(); }
    };

    Status s = target()->NewWritableFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  // getter
  ReportFileOpCounters* counters() { return &counters_; }

 private:
  ReportFileOpCounters counters_;
};

}  // namespace
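// When --report_file_operations is set, the Benchmark constructor below swaps
// FLAGS_env for a ReportFileOpEnv, and Stats::Report() prints the counters
// collected here at the end of the run.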

// Helper for quickly generating random data.
class RandomGenerator {
 private:
  std::string data_;
  unsigned int pos_;

 public:
  RandomGenerator() {
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
    while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

  Slice Generate(unsigned int len) {
    assert(len <= data_.size());
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }

  Slice GenerateWithTTL(unsigned int len) {
    assert(len <= data_.size());
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }
};

static void AppendWithSpace(std::string* str, Slice msg) {
  if (msg.empty()) return;
  if (!str->empty()) {
    str->push_back(' ');
  }
  str->append(msg.data(), msg.size());
}

struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
#ifndef ROCKSDB_LITE
  OptimisticTransactionDB* opt_txn_db;
#endif  // ROCKSDB_LITE
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
                                     // on cfh[i].

  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        , opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
    cfh.clear();
    num_created = 0;
    num_hot = 0;
  }

  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
#ifndef ROCKSDB_LITE
        opt_txn_db(other.opt_txn_db),
#endif  // ROCKSDB_LITE
        num_created(other.num_created.load()),
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }

  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
#ifndef ROCKSDB_LITE
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
    } else {
      delete db;
      db = nullptr;
    }
#else
    delete db;
    db = nullptr;
#endif  // ROCKSDB_LITE
  }

  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
               rand_offset];
  }

  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
  //        stage * num_hot + 1 to stage * (num_hot + 1).
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    num_created.store(new_num_created, std::memory_order_release);
  }
};
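// Note on column-family rotation: GetCfh() serves reads/writes from the
// num_hot most recently created column families, picking one either uniformly
// or weighted by cfh_idx_to_prob, while CreateNewCf() creates the next batch
// of num_hot column families whenever a benchmark stage advances.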

// a class that reports stats to CSV file
class ReporterAgent {
 public:
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }

    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
  }

  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // thread safe
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }
  void SleepAndReport() {
    auto time_started = env_->NowMicros();
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
          (env_->NowMicros() - time_started + kMicrosInSecond / 2) /
          kMicrosInSecond;
      std::string report = ToString(secs_elapsed) + "," +
                           ToString(total_ops_done_snapshot - last_report_) +
                           "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;
  const uint64_t report_interval_secs_;
  rocksdb::port::Thread reporting_thread_;
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;
};
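// ReporterAgent appends one "secs_elapsed,interval_qps" row to the report file
// every report_interval_secs, using the operation counts that benchmark
// threads feed to it through ReportFinishedOps().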

enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
                          OperationTypeString = {
  {kRead, "read"},
  {kWrite, "write"},
  {kDelete, "delete"},
  {kSeek, "seek"},
  {kMerge, "merge"},
  {kUpdate, "update"},
  {kCompress, "compress"},
  {kUncompress, "uncompress"},
  {kCrc, "crc"},
  {kHash, "hash"},
  {kOthers, "op"}
};
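// When --histogram is set, Stats keeps one latency histogram per
// OperationType; the strings above are the labels used when those histograms
// are printed by Stats::Report().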

class CombinedStats;
class Stats {
 private:
  int id_;
  uint64_t start_;
  uint64_t sine_interval_;
  uint64_t finish_;
  double seconds_;
  uint64_t done_;
  uint64_t last_report_done_;
  uint64_t next_report_;
  uint64_t bytes_;
  uint64_t last_op_finish_;
  uint64_t last_report_finish_;
  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
                     std::hash<unsigned char>> hist_;
  std::string message_;
  bool exclude_from_merge_;
  ReporterAgent* reporter_agent_;  // does not own
  friend class CombinedStats;

 public:
  Stats() { Start(-1); }

  void SetReporterAgent(ReporterAgent* reporter_agent) {
    reporter_agent_ = reporter_agent;
  }

  void Start(int id) {
    id_ = id;
    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
    last_op_finish_ = start_;
    hist_.clear();
    done_ = 0;
    last_report_done_ = 0;
    bytes_ = 0;
    seconds_ = 0;
    start_ = FLAGS_env->NowMicros();
    sine_interval_ = FLAGS_env->NowMicros();
    finish_ = start_;
    last_report_finish_ = start_;
    message_.clear();
    // When set, stats from this thread won't be merged with others.
    exclude_from_merge_ = false;
  }

  void Merge(const Stats& other) {
    if (other.exclude_from_merge_)
      return;

    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
      auto this_it = hist_.find(it->first);
      if (this_it != hist_.end()) {
        this_it->second->Merge(*(other.hist_.at(it->first)));
      } else {
        hist_.insert({ it->first, it->second });
      }
    }

    done_ += other.done_;
    bytes_ += other.bytes_;
    seconds_ += other.seconds_;
    if (other.start_ < start_) start_ = other.start_;
    if (other.finish_ > finish_) finish_ = other.finish_;

    // Just keep the messages from one thread
    if (message_.empty()) message_ = other.message_;
  }

  void Stop() {
    finish_ = FLAGS_env->NowMicros();
    seconds_ = (finish_ - start_) * 1e-6;
  }

  void AddMessage(Slice msg) {
    AppendWithSpace(&message_, msg);
  }

  void SetId(int id) { id_ = id; }
  void SetExcludeFromMerge() { exclude_from_merge_ = true; }

  void PrintThreadStatus() {
    std::vector<ThreadStatus> thread_list;
    FLAGS_env->GetThreadList(&thread_list);

    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
        "ThreadID", "ThreadType", "cfName", "Operation",
        "ElapsedTime", "Stage", "State", "OperationProperties");

    int64_t current_time = 0;
    Env::Default()->GetCurrentTime(&current_time);
    for (auto ts : thread_list) {
      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
          ts.thread_id,
          ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
          ts.cf_name.c_str(),
          ThreadStatus::GetOperationName(ts.operation_type).c_str(),
          ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
          ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
          ThreadStatus::GetStateName(ts.state_type).c_str());

      auto op_properties = ThreadStatus::InterpretOperationProperties(
          ts.operation_type, ts.op_properties);
      for (const auto& op_prop : op_properties) {
        fprintf(stderr, " %s %" PRIu64" |",
            op_prop.first.c_str(), op_prop.second);
      }
      fprintf(stderr, "\n");
    }
  }

  void ResetSineInterval() {
    sine_interval_ = FLAGS_env->NowMicros();
  }

  uint64_t GetSineInterval() {
    return sine_interval_;
  }

  uint64_t GetStart() {
    return start_;
  }

  void ResetLastOpTime() {
    // Set to now to avoid latency from calls to SleepForMicroseconds
    last_op_finish_ = FLAGS_env->NowMicros();
  }

  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
                   enum OperationType op_type = kOthers) {
    if (reporter_agent_) {
      reporter_agent_->ReportFinishedOps(num_ops);
    }
    if (FLAGS_histogram) {
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t micros = now - last_op_finish_;

      if (hist_.find(op_type) == hist_.end())
      {
        auto hist_temp = std::make_shared<HistogramImpl>();
        hist_.insert({op_type, std::move(hist_temp)});
      }
      hist_[op_type]->Add(micros);

      if (micros > 20000 && !FLAGS_stats_interval) {
        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_ += num_ops;
    if (done_ >= next_report_) {
      if (!FLAGS_stats_interval) {
        if      (next_report_ < 1000)   next_report_ += 100;
        else if (next_report_ < 5000)   next_report_ += 500;
        else if (next_report_ < 10000)  next_report_ += 1000;
        else if (next_report_ < 50000)  next_report_ += 5000;
        else if (next_report_ < 100000) next_report_ += 10000;
        else if (next_report_ < 500000) next_report_ += 50000;
        else                            next_report_ += 100000;
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
      } else {
        uint64_t now = FLAGS_env->NowMicros();
        int64_t usecs_since_last = now - last_report_finish_;

        // Determine whether to print status where interval is either
        // each N operations or each N seconds.

        if (FLAGS_stats_interval_seconds &&
            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
          // Don't check again for this many operations
          next_report_ += FLAGS_stats_interval;

        } else {

          fprintf(stderr,
                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
                  FLAGS_env->TimeToString(now/1000000).c_str(),
                  id_,
                  done_ - last_report_done_, done_,
                  (done_ - last_report_done_) /
                  (usecs_since_last / 1000000.0),
                  done_ / ((now - start_) / 1000000.0),
                  (now - last_report_finish_) / 1000000.0,
                  (now - start_) / 1000000.0);

          if (id_ == 0 && FLAGS_stats_per_interval) {
            std::string stats;

            if (db_with_cfh && db_with_cfh->num_created.load()) {
              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                    &stats))
                  fprintf(stderr, "%s\n", stats.c_str());
                if (FLAGS_show_table_properties) {
                  for (int level = 0; level < FLAGS_num_levels; ++level) {
                    if (db->GetProperty(
                            db_with_cfh->cfh[i],
                            "rocksdb.aggregated-table-properties-at-level" +
                                ToString(level),
                            &stats)) {
                      if (stats.find("# entries=0") == std::string::npos) {
                        fprintf(stderr, "Level[%d]: %s\n", level,
                                stats.c_str());
                      }
                    }
                  }
                }
              }
            } else if (db) {
              if (db->GetProperty("rocksdb.stats", &stats)) {
                fprintf(stderr, "%s\n", stats.c_str());
              }
              if (FLAGS_show_table_properties) {
                for (int level = 0; level < FLAGS_num_levels; ++level) {
                  if (db->GetProperty(
                          "rocksdb.aggregated-table-properties-at-level" +
                              ToString(level),
                          &stats)) {
                    if (stats.find("# entries=0") == std::string::npos) {
                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
                    }
                  }
                }
              }
            }
          }

          next_report_ += FLAGS_stats_interval;
          last_report_finish_ = now;
          last_report_done_ = done_;
        }
      }
      if (id_ == 0 && FLAGS_thread_status_per_interval) {
        PrintThreadStatus();
      }
      fflush(stderr);
    }
  }

  void AddBytes(int64_t n) {
    bytes_ += n;
  }

  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedOps().
    if (done_ < 1) done_ = 1;

    std::string extra;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      double elapsed = (finish_ - start_) * 1e-6;
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
    double elapsed = (finish_ - start_) * 1e-6;
    double throughput = (double)done_/elapsed;

    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
            name.ToString().c_str(),
            seconds_ * 1e6 / done_,
            (long)throughput,
            (extra.empty() ? "" : " "),
            extra.c_str());
    if (FLAGS_histogram) {
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
                it->second->ToString().c_str());
      }
    }
    if (FLAGS_report_file_operations) {
      ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
      ReportFileOpCounters* counters = env->counters();
      fprintf(stdout, "Num files opened: %d\n",
              counters->open_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num Read(): %d\n",
              counters->read_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num Append(): %d\n",
              counters->append_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
              counters->bytes_read_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
              counters->bytes_written_.load(std::memory_order_relaxed));
      env->reset();
    }
    fflush(stdout);
  }
};
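// Stats are collected per thread and combined with Merge(); threads that call
// SetExcludeFromMerge() are skipped, and Report() prints the merged totals
// for the benchmark.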

class CombinedStats {
 public:
  void AddStats(const Stats& stat) {
    uint64_t total_ops = stat.done_;
    uint64_t total_bytes_ = stat.bytes_;
    double elapsed;

    if (total_ops < 1) {
      total_ops = 1;
    }

    elapsed = (stat.finish_ - stat.start_) * 1e-6;
    throughput_ops_.emplace_back(total_ops / elapsed);

    if (total_bytes_ > 0) {
      double mbs = (total_bytes_ / 1048576.0);
      throughput_mbs_.emplace_back(mbs / elapsed);
    }
  }

  void Report(const std::string& bench_name) {
    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      fprintf(stdout,
              "%s [AVG    %d runs] : %d ops/sec; %6.1f MB/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              CalcAvg(throughput_mbs_), name, num_runs,
              static_cast<int>(CalcMedian(throughput_ops_)),
              CalcMedian(throughput_mbs_));
    } else {
      fprintf(stdout,
              "%s [AVG    %d runs] : %d ops/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
    }
  }

 private:
  double CalcAvg(std::vector<double> data) {
    double avg = 0;
    for (double x : data) {
      avg += x;
    }
    avg = avg / data.size();
    return avg;
  }

  double CalcMedian(std::vector<double> data) {
    assert(data.size() > 0);
    std::sort(data.begin(), data.end());

    size_t mid = data.size() / 2;
    if (data.size() % 2 == 1) {
      // Odd number of entries
      return data[mid];
    } else {
      // Even number of entries
      return (data[mid] + data[mid - 1]) / 2;
    }
  }

  std::vector<double> throughput_ops_;
  std::vector<double> throughput_mbs_;
};
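// CombinedStats aggregates per-run throughput when a benchmark is repeated
// (see the bracketed "X"/"W" arguments parsed in Run() below) and reports the
// average and median ops/sec across runs.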

class TimestampEmulator {
 private:
  std::atomic<uint64_t> timestamp_;

 public:
  TimestampEmulator() : timestamp_(0) {}
  uint64_t Get() const { return timestamp_.load(); }
  void Inc() { timestamp_++; }
};
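// TimestampEmulator is a simple logical clock; the expired-time compaction
// filter defined further below compares the timestamp embedded in each key
// against it to decide whether the key should be dropped.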

// State shared by all concurrent executions of the same benchmark.
struct SharedState {
  port::Mutex mu;
  port::CondVar cv;
  int total;
  int perf_level;
  std::shared_ptr<RateLimiter> write_rate_limiter;
  std::shared_ptr<RateLimiter> read_rate_limiter;

  // Each thread goes through the following states:
  //    (1) initializing
  //    (2) waiting for others to be initialized
  //    (3) running
  //    (4) done

  long num_initialized;
  long num_done;
  bool start;

  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;             // 0..n-1 when running in n threads
  Random64 rand;         // Has different seeds for different threads
  Stats stats;
  SharedState* shared;

  /* implicit */ ThreadState(int index)
      : tid(index),
        rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {
  }
};

class Duration {
 public:
  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
    max_seconds_ = max_seconds;
    max_ops_= max_ops;
    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
    ops_ = 0;
    start_at_ = FLAGS_env->NowMicros();
  }

  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }

  bool Done(int64_t increment) {
    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
    ops_ += increment;

    if (max_seconds_) {
      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
      auto granularity = FLAGS_ops_between_duration_checks;
      if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
        uint64_t now = FLAGS_env->NowMicros();
        return ((now - start_at_) / 1000000) >= max_seconds_;
      } else {
        return false;
      }
    } else {
      return ops_ > max_ops_;
    }
  }

 private:
  uint64_t max_seconds_;
  int64_t max_ops_;
  int64_t ops_per_stage_;
  int64_t ops_;
  uint64_t start_at_;
};
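// Duration ends a benchmark either after max_ops operations or after
// max_seconds of wall-clock time (the clock is only consulted roughly every
// FLAGS_ops_between_duration_checks operations); GetStage() groups operations
// into fixed-size stages of ops_per_stage_ operations each.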

class Benchmark {
 private:
  std::shared_ptr<Cache> cache_;
  std::shared_ptr<Cache> compressed_cache_;
  std::shared_ptr<const FilterPolicy> filter_policy_;
  const SliceTransform* prefix_extractor_;
  DBWithColumnFamilies db_;
  std::vector<DBWithColumnFamilies> multi_dbs_;
  int64_t num_;
  int value_size_;
  int key_size_;
  int prefix_size_;
  int64_t keys_per_prefix_;
  int64_t entries_per_batch_;
  int64_t writes_before_delete_range_;
  int64_t writes_per_range_tombstone_;
  int64_t range_tombstone_width_;
  int64_t max_num_range_tombstones_;
  WriteOptions write_options_;
  Options open_options_;  // keep options around to properly destroy db later
#ifndef ROCKSDB_LITE
  TraceOptions trace_options_;
  TraceOptions block_cache_trace_options_;
#endif
  int64_t reads_;
  int64_t deletes_;
  double read_random_exp_range_;
  int64_t writes_;
  int64_t readwrites_;
  int64_t merge_keys_;
  bool report_file_operations_;
  bool use_blob_db_;
  std::vector<std::string> keys_;

  class ErrorHandlerListener : public EventListener {
   public:
#ifndef ROCKSDB_LITE
    ErrorHandlerListener()
        : mutex_(),
          cv_(&mutex_),
          no_auto_recovery_(false),
          recovery_complete_(false) {}

    ~ErrorHandlerListener() override {}

    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
                              Status /*bg_error*/,
                              bool* auto_recovery) override {
      if (*auto_recovery && no_auto_recovery_) {
        *auto_recovery = false;
      }
    }

    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
      InstrumentedMutexLock l(&mutex_);
      recovery_complete_ = true;
      cv_.SignalAll();
    }

    bool WaitForRecovery(uint64_t abs_time_us) {
      InstrumentedMutexLock l(&mutex_);
      if (!recovery_complete_) {
        cv_.TimedWait(abs_time_us);
      }
      if (recovery_complete_) {
        recovery_complete_ = false;
        return true;
      }
      return false;
    }

    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }

   private:
    InstrumentedMutex mutex_;
    InstrumentedCondVar cv_;
    bool no_auto_recovery_;
    bool recovery_complete_;
#else   // ROCKSDB_LITE
    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
    void EnableAutoRecovery(bool /*enable*/) {}
#endif  // ROCKSDB_LITE
  };

  std::shared_ptr<ErrorHandlerListener> listener_;

  bool SanityCheck() {
    if (FLAGS_compression_ratio > 1) {
      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
      return false;
    }
    return true;
  }

  inline bool CompressSlice(const CompressionInfo& compression_info,
                            const Slice& input, std::string* compressed) {
    bool ok = true;
    switch (FLAGS_compression_type_e) {
      case rocksdb::kSnappyCompression:
        ok = Snappy_Compress(compression_info, input.data(), input.size(),
                             compressed);
        break;
      case rocksdb::kZlibCompression:
        ok = Zlib_Compress(compression_info, 2, input.data(), input.size(),
                           compressed);
        break;
      case rocksdb::kBZip2Compression:
        ok = BZip2_Compress(compression_info, 2, input.data(), input.size(),
                            compressed);
        break;
      case rocksdb::kLZ4Compression:
        ok = LZ4_Compress(compression_info, 2, input.data(), input.size(),
                          compressed);
        break;
      case rocksdb::kLZ4HCCompression:
        ok = LZ4HC_Compress(compression_info, 2, input.data(), input.size(),
                            compressed);
        break;
      case rocksdb::kXpressCompression:
        ok = XPRESS_Compress(input.data(),
          input.size(), compressed);
        break;
      case rocksdb::kZSTD:
        ok = ZSTD_Compress(compression_info, input.data(), input.size(),
                           compressed);
        break;
      default:
        ok = false;
    }
    return ok;
  }
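  // CompressSlice() dispatches to the compressor selected by
  // FLAGS_compression_type_e; PrintWarnings() below uses it to verify that the
  // chosen compression library is actually compiled into this binary.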

  void PrintHeader() {
    PrintEnvironment();
    fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
            FLAGS_value_size,
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
    fprintf(stdout, "Entries:    %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix:    %" PRIu64 "\n", keys_per_prefix_);
    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + FLAGS_value_size) * num_)
             / 1048576.0));
    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
            (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
              * num_)
             / 1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);

    switch (FLAGS_rep_factory) {
      case kPrefixHash:
        fprintf(stdout, "Memtablerep: prefix_hash\n");
        break;
      case kSkipList:
        fprintf(stdout, "Memtablerep: skip_list\n");
        break;
      case kVectorRep:
        fprintf(stdout, "Memtablerep: vector\n");
        break;
      case kHashLinkedList:
        fprintf(stdout, "Memtablerep: hash_linkedlist\n");
        break;
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }

  void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(stdout,
            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
            );
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
      // The test string should not be too small.
      const int len = FLAGS_block_size;
      std::string input_str(len, 'y');
      std::string compressed;
      CompressionOptions opts;
      CompressionContext context(FLAGS_compression_type_e);
      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                           FLAGS_compression_type_e,
                           FLAGS_sample_for_compression);
      bool result = CompressSlice(info, Slice(input_str), &compressed);

      if (!result) {
        fprintf(stdout, "WARNING: %s compression is not enabled\n",
                compression);
      } else if (compressed.size() >= input_str.size()) {
        fprintf(stdout, "WARNING: %s compression is not effective\n",
                compression);
      }
    }
  }

// Currently, the following isn't equivalent to OS_LINUX.
#if defined(__linux)
  static Slice TrimSpace(Slice s) {
    unsigned int start = 0;
    while (start < s.size() && isspace(s[start])) {
      start++;
    }
    unsigned int limit = static_cast<unsigned int>(s.size());
    while (limit > start && isspace(s[limit-1])) {
      limit--;
    }
    return Slice(s.data() + start, limit - start);
  }
#endif

  void PrintEnvironment() {
    fprintf(stderr, "RocksDB:    version %d.%d\n",
            kMajorVersion, kMinorVersion);

#if defined(__linux)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
    fprintf(stderr, "Date:       %s",
            ctime_r(&now, buf));  // ctime_r() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
    }
#endif
  }

  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
                         const Slice& key) {
    const char* pos = key.data();
    pos += 8;
    uint64_t timestamp = 0;
    if (port::kLittleEndian) {
      int bytes_to_fill = 8;
      for (int i = 0; i < bytes_to_fill; ++i) {
        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
                      << ((bytes_to_fill - i - 1) << 3));
      }
    } else {
      memcpy(&timestamp, pos, sizeof(timestamp));
    }
    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
  }

  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };
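  // ExpiredTimeFilter drops a key during compaction once the timestamp
  // embedded at byte offset 8 of the key falls more than FLAGS_time_range
  // behind the TimestampEmulator's current value (see KeyExpired() above).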

  class KeepFilter : public CompactionFilter {
   public:
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return false;
    }

    const char* Name() const override { return "KeepFilter"; }
  };

  std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_use_clock_cache) {
      auto cache = NewClockCache(static_cast<size_t>(capacity),
                                 FLAGS_cache_numshardbits);
      if (!cache) {
        fprintf(stderr, "Clock cache not supported.");
        exit(1);
      }
      return cache;
    } else {
      return NewLRUCache(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio);
    }
  }
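  // NewCache() builds the block caches shared by the run: a ClockCache when
  // --use_clock_cache is set (exiting if this build does not support it),
  // otherwise an LRUCache with FLAGS_cache_numshardbits shards and
  // FLAGS_cache_high_pri_pool_ratio reserved for high-priority entries.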

 public:
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        filter_policy_(FLAGS_bloom_bits >= 0
                           ? NewBloomFilterPolicy(FLAGS_bloom_bits,
                                                  FLAGS_use_block_based_filter)
                           : nullptr),
        prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
        num_(FLAGS_num),
        value_size_(FLAGS_value_size),
        key_size_(FLAGS_key_size),
        prefix_size_(FLAGS_prefix_size),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db)
#else
        use_blob_db_(false)
#endif  // !ROCKSDB_LITE
  {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

2476 2477 2478 2479 2480 2481 2482 2483 2484 2485
    if (report_file_operations_) {
      if (!FLAGS_hdfs.empty()) {
        fprintf(stderr,
                "--hdfs and --report_file_operations cannot be enabled "
                "at the same time");
        exit(1);
      }
      FLAGS_env = new ReportFileOpEnv(rocksdb::Env::Default());
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      Options options;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
  }

  ~Benchmark() {
    db_.DeleteDBs();
    delete prefix_extractor_;
    if (cache_.get() != nullptr) {
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }

  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    char* data = new char[key_size_];
    const char* const_data = data;
    key_guard->reset(const_data);
    return Slice(key_guard->get(), key_size_);
  }

  // Generate key according to the given specification and random number.
  // The resulting key will have the following format (if keys_per_prefix_
  // is positive), extra trailing bytes are either cut off or padded with '0'.
  // The prefix value is derived from key value.
  //   ----------------------------
  //   | prefix 00000 | key 00000 |
  //   ----------------------------
  // If keys_per_prefix_ is 0, the key is simply a binary representation of
  // random number followed by trailing '0's
  //   ----------------------------
  //   |        key 00000         |
  //   ----------------------------
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      memset(pos, '0', key_size_ - (pos - start));
    }
  }
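  // Example: with the default keys_per_prefix_ == 0 and key_size_ == 16, the
  // generated key is the 8-byte big-endian encoding of v followed by eight
  // '0' characters; with keys_per_prefix_ > 0 the leading prefix_size_ bytes
  // encode v % (num_keys / keys_per_prefix_) instead.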

  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
    GenerateKeyFromInt(v, num_keys, key);
    if (FLAGS_seek_missing_prefix) {
      assert(prefix_size_ > 8);
      char* key_ptr = const_cast<char*>(key->data());
      // This relies on GenerateKeyFromInt filling the padding with '0's.
      // Putting a '1' will create a non-existing prefix.
      key_ptr[8] = '1';
    }
  }

  std::string GetPathForMultiple(std::string base_name, size_t id) {
    if (!base_name.empty()) {
#ifndef OS_WIN
      if (base_name.back() != '/') {
        base_name += '/';
      }
#else
      if (base_name.back() != '\\') {
        base_name += '\\';
      }
#endif
    }
    return base_name + ToString(id);
  }
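  // For example, GetPathForMultiple("/tmp/dbbench", 3) yields "/tmp/dbbench/3"
  // (with a backslash separator on Windows); it is used to derive per-instance
  // paths when the benchmark drives more than one DB.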

  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No more key should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }

  void Run() {
    if (!SanityCheck()) {
      exit(1);
    }
    Open(&open_options_);
    PrintHeader();
    std::stringstream benchmark_stream(FLAGS_benchmarks);
    std::string name;
    std::unique_ptr<ExpiredTimeFilter> filter;
    while (std::getline(benchmark_stream, name, ',')) {
      // Sanitize parameters
      num_ = FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
      value_size_ = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = FLAGS_batch_size;
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
      write_options_ = WriteOptions();
      read_random_exp_range_ = FLAGS_read_random_exp_range;
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;

      void (Benchmark::*method)(ThreadState*) = nullptr;
      void (Benchmark::*post_process_method)() = nullptr;

      bool fresh_db = false;
      int num_threads = FLAGS_threads;

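      // A benchmark name may carry a repeat/warmup suffix, e.g.
      // (illustrative) --benchmarks="readrandom[X3-W1]" warms up once and
      // then runs readrandom three times, reporting the combined stats.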
      int num_repeat = 1;
      int num_warmup = 0;
      if (!name.empty() && *name.rbegin() == ']') {
        auto it = name.find('[');
        if (it == std::string::npos) {
          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
          exit(1);
        }
        std::string args = name.substr(it + 1);
        args.resize(args.size() - 1);
        name.resize(it);

        std::string bench_arg;
        std::stringstream args_stream(args);
        while (std::getline(args_stream, bench_arg, '-')) {
          if (bench_arg.empty()) {
            continue;
          }
          if (bench_arg[0] == 'X') {
            // Repeat the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_repeat = std::stoi(num_str);
          } else if (bench_arg[0] == 'W') {
            // Warm up the benchmark for n times
            std::string num_str = bench_arg.substr(1);
            num_warmup = std::stoi(num_str);
          }
        }
      }

      // Both fillseqdeterministic and filluniquerandomdeterministic
      // fill the levels except the max level with UNIQUE_RANDOM
      // and fill the max level with fillseq and filluniquerandom, respectively
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please disable_auto_compactions in FillDeterministic "
                  "benchmark\n");
          exit(1);
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
      } else if (name == "fillseq") {
        fresh_db = true;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillbatch") {
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillrandom") {
        fresh_db = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "filluniquerandom") {
        fresh_db = true;
        if (num_threads > 1) {
          fprintf(stderr,
                  "filluniquerandom multithreaded not supported"
                  ", use 1 thread");
          num_threads = 1;
        }
        method = &Benchmark::WriteUniqueRandom;
      } else if (name == "overwrite") {
        method = &Benchmark::WriteRandom;
      } else if (name == "fillsync") {
        fresh_db = true;
        num_ /= 1000;
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "fill100K") {
        fresh_db = true;
        num_ /= 1000;
        value_size_ = 100 * 1000;
        method = &Benchmark::WriteRandom;
      } else if (name == "readseq") {
        method = &Benchmark::ReadSequential;
      } else if (name == "readtocache") {
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
      } else if (name == "readreverse") {
        method = &Benchmark::ReadReverse;
      } else if (name == "readrandom") {
        if (FLAGS_multiread_stride) {
          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                  entries_per_batch_);
        }
        method = &Benchmark::ReadRandom;
      } else if (name == "readrandomfast") {
        method = &Benchmark::ReadRandomFast;
      } else if (name == "multireadrandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::MultiReadRandom;
      } else if (name == "mixgraph") {
        method = &Benchmark::MixGraph;
      } else if (name == "readmissing") {
        ++key_size_;
        method = &Benchmark::ReadRandom;
      } else if (name == "newiterator") {
        method = &Benchmark::IteratorCreation;
      } else if (name == "newiteratorwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
      } else if (name == "seekrandom") {
        method = &Benchmark::SeekRandom;
      } else if (name == "seekrandomwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
      } else if (name == "seekrandomwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::SeekRandomWhileMerging;
      } else if (name == "readrandomsmall") {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == "deleteseq") {
        method = &Benchmark::DeleteSeq;
      } else if (name == "deleterandom") {
        method = &Benchmark::DeleteRandom;
      } else if (name == "readwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
      } else if (name == "readwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::ReadWhileMerging;
      } else if (name == "readwhilescanning") {
        num_threads++;  // Add extra thread for scanning
        method = &Benchmark::ReadWhileScanning;
      } else if (name == "readrandomwriterandom") {
        method = &Benchmark::ReadRandomWriteRandom;
      } else if (name == "readrandommergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          exit(1);
        }
        method = &Benchmark::ReadRandomMergeRandom;
      } else if (name == "updaterandom") {
        method = &Benchmark::UpdateRandom;
      } else if (name == "xorupdaterandom") {
        method = &Benchmark::XORUpdateRandom;
      } else if (name == "appendrandom") {
        method = &Benchmark::AppendRandom;
      } else if (name == "mergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          exit(1);
        }
        method = &Benchmark::MergeRandom;
      } else if (name == "randomwithverify") {
        method = &Benchmark::RandomWithVerify;
      } else if (name == "fillseekseq") {
        method = &Benchmark::WriteSeqSeekSeq;
      } else if (name == "compact") {
        method = &Benchmark::Compact;
      } else if (name == "compactall") {
        CompactAll();
      } else if (name == "crc32c") {
        method = &Benchmark::Crc32c;
      } else if (name == "xxhash") {
        method = &Benchmark::xxHash;
      } else if (name == "acquireload") {
        method = &Benchmark::AcquireLoad;
      } else if (name == "compress") {
        method = &Benchmark::Compress;
      } else if (name == "uncompress") {
        method = &Benchmark::Uncompress;
#ifndef ROCKSDB_LITE
      } else if (name == "randomtransaction") {
        method = &Benchmark::RandomTransaction;
        post_process_method = &Benchmark::RandomTransactionVerify;
#endif  // ROCKSDB_LITE
      } else if (name == "randomreplacekeys") {
        fresh_db = true;
        method = &Benchmark::RandomReplaceKeys;
      } else if (name == "timeseries") {
        timestamp_emulator_.reset(new TimestampEmulator());
        if (FLAGS_expire_style == "compaction_filter") {
          filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
          fprintf(stdout, "Compaction filter is used to remove expired data");
          open_options_.compaction_filter = filter.get();
        }
        fresh_db = true;
        method = &Benchmark::TimeSeries;
      } else if (name == "stats") {
        PrintStats("rocksdb.stats");
      } else if (name == "resetstats") {
        ResetStats();
      } else if (name == "verify") {
        VerifyDBFromDB(FLAGS_truth_db);
      } else if (name == "levelstats") {
        PrintStats("rocksdb.levelstats");
      } else if (name == "sstables") {
        PrintStats("rocksdb.sstables");
      } else if (name == "stats_history") {
        PrintStatsHistory();
      } else if (name == "replay") {
        if (num_threads > 1) {
          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
          exit(1);
        }
        if (FLAGS_trace_file == "") {
          fprintf(stderr, "Please set --trace_file to be replayed from\n");
          exit(1);
        }
        method = &Benchmark::Replay;
      } else if (name == "getmergeoperands") {
        method = &Benchmark::GetMergeOperands;
      } else if (!name.empty()) {  // No error message for empty name
        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
        exit(1);
      }

      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
                  name.c_str());
          method = nullptr;
        } else {
          if (db_.db != nullptr) {
            db_.DeleteDBs();
            DestroyDB(FLAGS_db, open_options_);
          }
          Options options = open_options_;
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
            delete multi_dbs_[i].db;
            if (!open_options_.wal_dir.empty()) {
              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
            }
            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
          }
          multi_dbs_.clear();
        }
        Open(&open_options_);  // use open_options for the last accessed
      }

      if (method != nullptr) {
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

#ifndef ROCKSDB_LITE
        // A trace_file option can be provided both for trace and replay
        // operations. But db_bench does not support tracing and replaying at
        // the same time, for now. So, start tracing only when it is not a
        // replay.
        if (FLAGS_trace_file != "" && name != "replay") {
          std::unique_ptr<TraceWriter> trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_trace_file, &trace_writer);
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          fprintf(stdout, "Tracing the workload to: [%s]\n",
                  FLAGS_trace_file.c_str());
        }
        // Start block cache tracing.
        if (!FLAGS_block_cache_trace_file.empty()) {
          // Sanity checks.
          if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
            fprintf(stderr,
                    "Block cache trace sampling frequency must be higher than "
                    "0.\n");
            exit(1);
          }
          if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
            fprintf(stderr,
                    "The maximum file size for block cache tracing must be "
                    "higher than 0.\n");
            exit(1);
          }
          block_cache_trace_options_.max_trace_file_size =
              FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
          block_cache_trace_options_.sampling_frequency =
              FLAGS_block_cache_trace_sampling_frequency;
          std::unique_ptr<TraceWriter> block_cache_trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_block_cache_trace_file,
                                        &block_cache_trace_writer);
          if (!s.ok()) {
            fprintf(stderr,
                    "Encountered an error when creating trace writer, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
                                           std::move(block_cache_trace_writer));
          if (!s.ok()) {
            fprintf(
                stderr,
                "Encountered an error when starting block cache tracing, %s\n",
                s.ToString().c_str());
            exit(1);
          }
          fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
                  FLAGS_block_cache_trace_file.c_str());
        }
#endif  // ROCKSDB_LITE

        if (num_warmup > 0) {
          printf("Warming up benchmark by running %d times\n", num_warmup);
        }

        for (int i = 0; i < num_warmup; i++) {
          RunBenchmark(num_threads, name, method);
        }

        if (num_repeat > 1) {
          printf("Running benchmark for %d times\n", num_repeat);
        }

        CombinedStats combined_stats;
        for (int i = 0; i < num_repeat; i++) {
          Stats stats = RunBenchmark(num_threads, name, method);
          combined_stats.AddStats(stats);
        }
        if (num_repeat > 1) {
          combined_stats.Report(name);
        }
      }
      if (post_process_method != nullptr) {
        (this->*post_process_method)();
      }
    }

    if (secondary_update_thread_) {
      secondary_update_stopped_.store(1, std::memory_order_relaxed);
      secondary_update_thread_->join();
      secondary_update_thread_.reset();
    }

#ifndef ROCKSDB_LITE
    if (name != "replay" && FLAGS_trace_file != "") {
      Status s = db_.db->EndTrace();
      if (!s.ok()) {
        fprintf(stderr, "Encountered an error ending the trace, %s\n",
                s.ToString().c_str());
      }
    }
    if (!FLAGS_block_cache_trace_file.empty()) {
      Status s = db_.db->EndBlockCacheTrace();
      if (!s.ok()) {
        fprintf(stderr,
                "Encountered an error ending the block cache tracing, %s\n",
                s.ToString().c_str());
      }
    }
#endif  // ROCKSDB_LITE

    if (FLAGS_statistics) {
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
    }
    if (FLAGS_simcache_size >= 0) {
      fprintf(stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
              static_cast_with_check<SimCache, Cache>(cache_.get())
                  ->ToString()
                  .c_str());
    }

#ifndef ROCKSDB_LITE
    if (FLAGS_use_secondary_db) {
      fprintf(stdout, "Secondary instance updated  %" PRIu64 " times.\n",
              secondary_db_updates_);
    }
#endif  // ROCKSDB_LITE
  }

 private:
  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  std::unique_ptr<port::Thread> secondary_update_thread_;
  std::atomic<int> secondary_update_stopped_{0};
#ifndef ROCKSDB_LITE
  uint64_t secondary_db_updates_ = 0;
#endif  // ROCKSDB_LITE
  struct ThreadArg {
    Benchmark* bm;
    SharedState* shared;
    ThreadState* thread;
    void (Benchmark::*method)(ThreadState*);
  };

  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        shared->cv.SignalAll();
      }
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
    perf_context.EnablePerLevelPerfContext();
    thread->stats.Start(thread->tid);
    (arg->bm->*(arg->method))(thread);
    thread->stats.Stop();

    {
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        shared->cv.SignalAll();
      }
    }
  }

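  // Spawns n benchmark threads, waits on SharedState until all of them have
  // initialized, releases them together, and merges the per-thread Stats
  // into the single Stats object returned to Run().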
  Stats RunBenchmark(int n, Slice name,
                     void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      shared.write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
    if (FLAGS_benchmark_read_rate_limit > 0) {
      shared.read_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
    }

    std::unique_ptr<ReporterAgent> reporter_agent;
    if (FLAGS_report_interval_seconds > 0) {
      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                             FLAGS_report_interval_seconds));
    }

    ThreadArg* arg = new ThreadArg[n];

    for (int i = 0; i < n; i++) {
#ifdef NUMA
      if (FLAGS_enable_numa) {
        // Performs a local allocation of memory to threads in numa node.
        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
        numa_exit_on_error = 1;
        int numa_node = i % n_nodes;
        bitmask* nodes = numa_allocate_nodemask();
        numa_bitmask_clearall(nodes);
        numa_bitmask_setbit(nodes, numa_node);
        // numa_bind() call binds the process to the node and these
        // properties are passed on to the thread that is created in
        // StartThread method called later in the loop.
        numa_bind(nodes);
        numa_set_strict(1);
        numa_free_nodemask(nodes);
      }
#endif
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;

    return merge_stats;
  }

  void Crc32c(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = FLAGS_block_size; // use --block_size option for db_bench
    std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
    const char* label = labels.c_str();

    std::string data(size, 'x');
    int64_t bytes = 0;
    uint32_t crc = 0;
    while (bytes < 500 * 1048576) {
      crc = crc32c::Value(data.data(), size);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCrc);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

  void xxHash(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = 4096;
    const char* label = "(4K per op)";
    std::string data(size, 'x');
    int64_t bytes = 0;
    unsigned int xxh32 = 0;
    while (bytes < 500 * 1048576) {
      xxh32 = XXH32(data.data(), size, 0);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kHash);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(xxh32));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

  void AcquireLoad(ThreadState* thread) {
    int dummy;
    std::atomic<void*> ap(&dummy);
    int count = 0;
    void *ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.load(std::memory_order_acquire);
      }
      count++;
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
    }
    if (ptr == nullptr) exit(1);  // Disable unused variable warning.
  }

  void Compress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;
    CompressionOptions opts;
    CompressionContext context(FLAGS_compression_type_e);
    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                         FLAGS_compression_type_e,
                         FLAGS_sample_for_compression);
    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      compressed.clear();
      ok = CompressSlice(info, input, &compressed);
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      char buf[340];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }

  void Uncompress(ThreadState *thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    CompressionContext compression_ctx(FLAGS_compression_type_e);
    CompressionOptions compression_opts;
    CompressionInfo compression_info(
        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
        FLAGS_compression_type_e, FLAGS_sample_for_compression);
    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
    UncompressionInfo uncompression_info(uncompression_ctx,
                                         UncompressionDict::GetEmptyDict(),
                                         FLAGS_compression_type_e);

    bool ok = CompressSlice(compression_info, input, &compressed);
    int64_t bytes = 0;
    int decompress_size;
    while (ok && bytes < 1024 * 1048576) {
      CacheAllocationPtr uncompressed;
      switch (FLAGS_compression_type_e) {
        case rocksdb::kSnappyCompression: {
          // get size and allocate here to make comparison fair
          size_t ulength = 0;
          if (!Snappy_GetUncompressedLength(compressed.data(),
                                            compressed.size(), &ulength)) {
            ok = false;
            break;
          }
          uncompressed = AllocateBlock(ulength, nullptr);
          ok = Snappy_Uncompress(compressed.data(), compressed.size(),
                                 uncompressed.get());
          break;
        }
      case rocksdb::kZlibCompression:
        uncompressed = Zlib_Uncompress(uncompression_info, compressed.data(),
                                       compressed.size(), &decompress_size, 2);
        ok = uncompressed.get() != nullptr;
        break;
      case rocksdb::kBZip2Compression:
        uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(),
                                        &decompress_size, 2);
        ok = uncompressed.get() != nullptr;
        break;
      case rocksdb::kLZ4Compression:
        uncompressed = LZ4_Uncompress(uncompression_info, compressed.data(),
                                      compressed.size(), &decompress_size, 2);
        ok = uncompressed.get() != nullptr;
        break;
      case rocksdb::kLZ4HCCompression:
        uncompressed = LZ4_Uncompress(uncompression_info, compressed.data(),
                                      compressed.size(), &decompress_size, 2);
        ok = uncompressed.get() != nullptr;
        break;
      case rocksdb::kXpressCompression:
        uncompressed.reset(XPRESS_Uncompress(
            compressed.data(), compressed.size(), &decompress_size));
        ok = uncompressed.get() != nullptr;
        break;
      case rocksdb::kZSTD:
        uncompressed = ZSTD_Uncompress(uncompression_info, compressed.data(),
                                       compressed.size(), &decompress_size);
        ok = uncompressed.get() != nullptr;
        break;
      default:
        ok = false;
      }
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }

  // Returns true if the options were initialized from the specified
  // options file.
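  // For example (illustrative flag value), --options_file=OPTIONS-000007
  // loads the DBOptions and the first column family's options from that
  // file instead of building them from the command-line flags.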
  bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
    printf("Initializing RocksDB Options from the specified file\n");
    DBOptions db_opts;
    std::vector<ColumnFamilyDescriptor> cf_descs;
    if (FLAGS_options_file != "") {
      auto s = LoadOptionsFromFile(FLAGS_options_file, Env::Default(), &db_opts,
                                   &cf_descs);
      if (s.ok()) {
        *opts = Options(db_opts, cf_descs[0].options);
        return true;
      }
      fprintf(stderr, "Unable to load options file %s --- %s\n",
              FLAGS_options_file.c_str(), s.ToString().c_str());
      exit(1);
    }
#else
    (void)opts;
#endif
    return false;
  }

  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;

    assert(db_.db == nullptr);

    options.max_open_files = FLAGS_open_files;
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
      FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
#ifndef ROCKSDB_LITE
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
#endif  // ROCKSDB_LITE
    if (FLAGS_prefix_size != 0) {
      options.prefix_extractor.reset(
          NewFixedPrefixTransform(FLAGS_prefix_size));
    }
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.new_table_reader_for_compaction_inputs =
        FLAGS_new_table_reader_for_compaction_inputs;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
                                     FLAGS_rep_factory == kHashLinkedList)) {
      fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
                      "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    switch (FLAGS_rep_factory) {
      case kSkipList:
        options.memtable_factory.reset(new SkipListFactory(
            FLAGS_skip_list_lookahead));
        break;
#ifndef ROCKSDB_LITE
      case kPrefixHash:
        options.memtable_factory.reset(
            NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
        break;
      case kHashLinkedList:
        options.memtable_factory.reset(NewHashLinkListRepFactory(
            FLAGS_hash_bucket_count));
        break;
      case kVectorRep:
        options.memtable_factory.reset(
          new VectorRepFactory
        );
        break;
#else
      default:
        fprintf(stderr, "Only skip list is supported in lite mode\n");
        exit(1);
#endif  // ROCKSDB_LITE
    }
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_rep_factory != kPrefixHash &&
          FLAGS_rep_factory != kHashLinkedList) {
        fprintf(stderr, "Warning: plain table is used with skipList\n");
      }

      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        bloom_bits_per_key = 0;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      rocksdb::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      BlockBasedTableOptions block_based_options;
      if (FLAGS_use_hash_search) {
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
              "prefix_size not assigned when enable use_hash_search \n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      }
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      block_based_options.block_cache = cache_;
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.filter_policy = filter_policy_;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            rocksdb::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
      if (FLAGS_read_cache_path != "") {
#ifndef ROCKSDB_LITE
        Status rc_status;

        // The read cache needs to be provided with a Logger; we will put all
        // read cache logs in the read cache path in a file named rc_LOG
        rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
        std::shared_ptr<Logger> read_cache_logger;
        if (rc_status.ok()) {
          rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
                                           &read_cache_logger);
        }

        if (rc_status.ok()) {
          PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
                                       FLAGS_read_cache_size,
                                       read_cache_logger);

          rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
          rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
          rc_cfg.writer_qdepth = 4;
          rc_cfg.writer_dispatch_size = 4 * 1024;

          auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
          block_based_options.persistent_cache = pcache;
          rc_status = pcache->Open();
        }

        if (!rc_status.ok()) {
          fprintf(stderr, "Error initializing read cache, %s\n",
                  rc_status.ToString().c_str());
          exit(1);
        }
#else
        fprintf(stderr, "Read cache is not supported in LITE\n");
        exit(1);

#endif
      }
      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
        FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
      FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;

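    // For example (illustrative values), --min_level_to_compress=2 with
    // --num_levels=7 keeps levels 0 and 1 uncompressed and applies the
    // configured compression type to levels 2 through 6.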
    if (FLAGS_min_level_to_compress >= 0) {
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress;
           i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    options.soft_rate_limit = FLAGS_soft_rate_limit;
    options.hard_rate_limit = FLAGS_hard_rate_limit;
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.rate_limit_delay_max_milliseconds =
      FLAGS_rate_limit_delay_max_milliseconds;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;

    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;

    // merge operator options
    options.merge_operator = MergeOperators::CreateFromStringId(
        FLAGS_merge_operator);
    if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
      fprintf(stderr, "invalid merge operator: %s\n",
              FLAGS_merge_operator.c_str());
      exit(1);
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;

    // set universal style compaction configurations, if applicable
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
        FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
        FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
        FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
        FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
        FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }

#ifndef ROCKSDB_LITE
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE

  }

  void InitializeOptionsGeneral(Options* opts) {
    Options& options = *opts;

    options.create_missing_column_families = FLAGS_num_column_families > 1;
    options.statistics = dbstats;
    options.wal_dir = FLAGS_wal_dir;
    options.create_if_missing = !FLAGS_use_existing_db;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);

    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    // If this is a block based table, set some related options
    if (options.table_factory->Name() == BlockBasedTableFactory::kName &&
        options.table_factory->GetOptions() != nullptr) {
      BlockBasedTableOptions* table_options =
          reinterpret_cast<BlockBasedTableOptions*>(
              options.table_factory->GetOptions());
      if (FLAGS_cache_size) {
        table_options->block_cache = cache_;
      }
      if (FLAGS_bloom_bits >= 0) {
        table_options->filter_policy.reset(NewBloomFilterPolicy(
            FLAGS_bloom_bits, FLAGS_use_block_based_filter));
      }
    }
    if (FLAGS_row_cache_size) {
      if (FLAGS_cache_numshardbits >= 1) {
        options.row_cache =
            NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
      } else {
        options.row_cache = NewLRUCache(FLAGS_row_cache_size);
      }
    }
    if (FLAGS_enable_io_prio) {
      FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
    }
    options.env = FLAGS_env;
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
      if (FLAGS_rate_limit_bg_reads &&
          !FLAGS_new_table_reader_for_compaction_inputs) {
        fprintf(stderr,
                "rate limit compaction reads must have "
                "new_table_reader_for_compaction_inputs set\n");
        exit(1);
      }
      options.rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_rate_limiter_bytes_per_sec, 100 * 1000 /* refill_period_us */,
          10 /* fairness */,
          FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                    : RateLimiter::Mode::kWritesOnly,
          FLAGS_rate_limiter_auto_tuned));
    }

    options.listeners.emplace_back(listener_);
    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter, this can be used to test compaction filter
    if (FLAGS_use_keep_filter) {
      options.compaction_filter = new KeepFilter();
      fprintf(stdout, "A noop compaction filter is used\n");
    }

    if (FLAGS_use_existing_keys) {
      // Only work on single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }

  void Open(Options* opts) {
    if (!InitializeOptionsFromFile(opts)) {
      InitializeOptionsFromFlags(opts);
    }

    InitializeOptionsGeneral(opts);
  }

  void OpenDb(Options options, const std::string& db_name,
      DBWithColumnFamilies* db) {
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
              ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
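      // Optional per-CF access weighting, e.g. (illustrative)
      // --column_family_distribution="80,20" with two hot column families;
      // the comma-separated percentages must sum to 100 and there must be
      // one entry per hot column family.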
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families,
            &db->cfh, &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
3928 3929 3930
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }

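  // Key ordering used by the fill benchmarks: SEQUENTIAL writes keys in
  // order, RANDOM samples keys with replacement, and UNIQUE_RANDOM writes a
  // random permutation so that each key is written exactly once.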
  enum WriteMode {
    RANDOM, SEQUENTIAL, UNIQUE_RANDOM
  };

  void WriteSeqDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
  }

  void WriteUniqueRandomDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style,
                           UNIQUE_RANDOM);
  }

  void WriteSeq(ThreadState* thread) {
    DoWrite(thread, SEQUENTIAL);
  }

  void WriteRandom(ThreadState* thread) {
    DoWrite(thread, RANDOM);
  }

  void WriteUniqueRandom(ThreadState* thread) {
    DoWrite(thread, UNIQUE_RANDOM);
  }

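  // Produces key indices for DoWrite according to the chosen WriteMode; for
  // UNIQUE_RANDOM it pre-generates a shuffled permutation of [0, num).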
  class KeyGenerator {
   public:
    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
                 uint64_t /*num_per_set*/ = 64 * 1024)
        : rand_(rand), mode_(mode), num_(num), next_(0) {
      if (mode_ == UNIQUE_RANDOM) {
        // NOTE: if memory consumption of this approach becomes a concern,
        // we can either break it into pieces and only random shuffle a section
        // each time. Alternatively, use a bit map implementation
        // (https://reviews.facebook.net/differential/diff/54627/)
        values_.resize(num_);
        for (uint64_t i = 0; i < num_; ++i) {
          values_[i] = i;
        }
        std::shuffle(
            values_.begin(), values_.end(),
            std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
      }
    }

    uint64_t Next() {
      switch (mode_) {
        case SEQUENTIAL:
          return next_++;
        case RANDOM:
          return rand_->Next() % num_;
        case UNIQUE_RANDOM:
          assert(next_ < num_);
          return values_[next_++];
      }
      assert(false);
      return std::numeric_limits<uint64_t>::max();
    }

   private:
    Random64* rand_;
    WriteMode mode_;
    const uint64_t num_;
    uint64_t next_;
    std::vector<uint64_t> values_;
  };

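  // Helpers that pick the target DB for an operation: the single db_ when it
  // is open, otherwise one of multi_dbs_ chosen from the random value.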
  DB* SelectDB(ThreadState* thread) {
    return SelectDBWithCfh(thread)->db;
  }

  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
    return SelectDBWithCfh(thread->rand.Next());
  }

  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
    if (db_.db != nullptr) {
      return &db_;
    } else  {
      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
  }

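  // Target rate for --sine_write_rate at time x (in seconds):
  // a * sin(b * x + c) + d, driven by the --sine_a/b/c/d flags.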
  double SineRate(double x) {
    return FLAGS_sine_a*sin((FLAGS_sine_b*x) + FLAGS_sine_c) + FLAGS_sine_d;
  }

  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }

    Duration duration(test_duration, max_ops, ops_per_stage);
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_ + max_num_range_tombstones_,
                                         ops_per_stage));
    }

    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch;
    Status s;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }

    int64_t stage = 0;
    int64_t num_written = 0;
    while (!duration.Done(entries_per_batch_)) {
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }

      size_t id = thread->rand.Next() % num_key_gens;
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
      batch.Clear();

      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            entries_per_batch_ * (value_size_ + key_size_), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }

      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = key_gens[id]->Next();
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          Slice val = gen.Generate(value_size_);
          int ttl = rand() % FLAGS_blob_db_max_ttl_range;
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          s = blobdb->PutWithTTL(write_options_, key, val, ttl);
#endif  //  ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, gen.Generate(value_size_));
        } else {
          // We use the same rand_num as the seed for the key and the column
          // family so that we can deterministically find the cfh
          // corresponding to a particular key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key,
                    gen.Generate(value_size_));
        }
        bytes += value_size_ + key_size_;
        ++num_written;
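        // Periodically cover a span of range_tombstone_width_ keys with a
        // range tombstone (or per-key deletes when expanding), once
        // writes_before_delete_range_ writes have been issued and until
        // max_num_range_tombstones_ tombstones have been written.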
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  //  ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  //  ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (!use_blob_db_) {
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
                  static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
                  static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
                  NewGenericRateLimiter(write_rate));
        }
      }
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
    }
    thread->stats.AddBytes(bytes);
  }

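  // Builds a deterministic LSM shape for the filldeterministic benchmarks:
  // writes and flushes sorted runs, then places them with CompactFiles
  // (CompactRange for FIFO) instead of relying on background compactions.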
  Status DoDeterministicCompact(ThreadState* thread,
                                CompactionStyle compaction_style,
                                WriteMode write_mode) {
#ifndef ROCKSDB_LITE
    ColumnFamilyMetaData meta;
    std::vector<DB*> db_list;
    if (db_.db != nullptr) {
      db_list.push_back(db_.db);
    } else {
      for (auto& db : multi_dbs_) {
        db_list.push_back(db.db);
      }
    }
    std::vector<Options> options_list;
    for (auto db : db_list) {
      options_list.push_back(db->GetOptions());
      if (compaction_style != kCompactionStyleFIFO) {
        db->SetOptions({{"disable_auto_compactions", "1"},
                        {"level0_slowdown_writes_trigger", "400000000"},
                        {"level0_stop_writes_trigger", "400000000"}});
      } else {
        db->SetOptions({{"disable_auto_compactions", "1"}});
      }
    }

    assert(!db_list.empty());
    auto num_db = db_list.size();
    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
    size_t output_level = open_options_.num_levels - 1;
    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
    std::vector<size_t> num_files_at_level0(num_db, 0);
    if (compaction_style == kCompactionStyleLevel) {
      if (num_levels == 0) {
        return Status::InvalidArgument("num_levels should be larger than 1");
      }
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          if (sorted_runs[i].size() == output_level) {
            auto& L1 = sorted_runs[i].back();
            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
            should_stop = true;
            continue;
          }
        }
        writes_ /=
            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels - 1) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit =
              MaxFileSizeForLevel(mutable_cf_options,
                  static_cast<int>(output_level), compaction_style);
          std::cout << sorted_runs[i][j].size() << std::endl;
          db->CompactFiles(compactionOptions, {sorted_runs[i][j].back().name,
                                               sorted_runs[i][j].front().name},
                           static_cast<int>(output_level - j) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleUniversal) {
      auto ratio = open_options_.compaction_options_universal.size_ratio;
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          num_files_at_level0[i] = meta.levels[0].files.size();
        }
        writes_ = static_cast<int64_t>(
            writes_ * static_cast<double>(100) / (ratio + 200));
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit =
              MaxFileSizeForLevel(mutable_cf_options,
                  static_cast<int>(output_level), compaction_style);
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              (output_level > j ? static_cast<int>(output_level - j)
                                : 0) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleFIFO) {
      if (num_levels != 1) {
        return Status::InvalidArgument(
          "num_levels should be 1 for FIFO compaction");
      }
      if (FLAGS_num_multi_db != 0) {
        return Status::InvalidArgument("Doesn't support multiDB");
      }
      auto db = db_list[0];
      std::vector<std::string> file_names;
      while (true) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        if (total_size >=
          db->GetOptions().compaction_options_fifo.max_table_files_size) {
          for (auto file_meta : meta.levels[0].files) {
            file_names.emplace_back(file_meta.name);
          }
          break;
        }
      }
      // TODO(shuzhang1989): Investigate why CompactFiles not working
      // auto compactionOptions = CompactionOptions();
      // db->CompactFiles(compactionOptions, file_names, 0);
      auto compactionOptions = CompactRangeOptions();
      db->CompactRange(compactionOptions, nullptr, nullptr);
    } else {
      fprintf(stdout,
              "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
              "filldeterministic");
      return Status::InvalidArgument("None compaction is not supported");
    }

// Verify seqno and key range
// Note: the seqno get changed at the max level by implementation
// optimization, so skip the check of the max level.
#ifndef NDEBUG
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      db->GetColumnFamilyMetaData(&meta);
      // verify the number of sorted runs
      if (compaction_style == kCompactionStyleLevel) {
        assert(num_levels - 1 == sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleUniversal) {
        assert(meta.levels[0].files.size() + num_levels - 1 ==
               sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleFIFO) {
        // TODO(gzh): FIFO compaction
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        assert(total_size <=
               db->GetOptions().compaction_options_fifo.max_table_files_size);
        break;
      }

      // verify smallest/largest seqno and key range of each sorted run
      auto max_level = num_levels - 1;
      int level;
      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
        level = static_cast<int>(max_level - i);
        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber sorted_run_largest_seqno = 0;
        std::string sorted_run_smallest_key, sorted_run_largest_key;
        bool first_key = true;
        for (auto fileMeta : sorted_runs[k][i]) {
          sorted_run_smallest_seqno =
              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
          sorted_run_largest_seqno =
              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
            sorted_run_smallest_key = fileMeta.smallestkey;
          }
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
            sorted_run_largest_key = fileMeta.largestkey;
          }
          first_key = false;
        }
        if (compaction_style == kCompactionStyleLevel ||
            (compaction_style == kCompactionStyleUniversal && level > 0)) {
          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
          SequenceNumber level_largest_seqno = 0;
          for (auto fileMeta : meta.levels[level].files) {
            level_smallest_seqno =
                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
            level_largest_seqno =
                std::max(level_largest_seqno, fileMeta.largest_seqno);
          }
          assert(sorted_run_smallest_key ==
                 meta.levels[level].files.front().smallestkey);
          assert(sorted_run_largest_key ==
                 meta.levels[level].files.back().largestkey);
          if (level != static_cast<int>(max_level)) {
            // compaction at max_level would change sequence number
            assert(sorted_run_smallest_seqno == level_smallest_seqno);
            assert(sorted_run_largest_seqno == level_largest_seqno);
          }
        } else if (compaction_style == kCompactionStyleUniversal) {
          // level <= 0 means sorted runs on level 0
          auto level0_file =
              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
          assert(sorted_run_smallest_key == level0_file.smallestkey);
          assert(sorted_run_largest_key == level0_file.largestkey);
          if (level != static_cast<int>(max_level)) {
            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
          }
        }
      }
    }
#endif
    // print the size of each sorted_run
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      fprintf(stdout,
              "---------------------- DB %" ROCKSDB_PRIszt
              " LSM ---------------------\n",
              k);
      db->GetColumnFamilyMetaData(&meta);
      for (auto& levelMeta : meta.levels) {
        if (levelMeta.files.empty()) {
          continue;
        }
        if (levelMeta.level == 0) {
          for (auto& fileMeta : levelMeta.files) {
            fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n",
                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
          }
        } else {
          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
                  levelMeta.level, levelMeta.files.front().name.c_str(),
                  levelMeta.files.back().name.c_str(), levelMeta.size);
        }
      }
    }
    for (size_t i = 0; i < num_db; i++) {
      db_list[i]->SetOptions(
          {{"disable_auto_compactions",
            std::to_string(options_list[i].disable_auto_compactions)},
           {"level0_slowdown_writes_trigger",
            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
           {"level0_stop_writes_trigger",
            std::to_string(options_list[i].level0_stop_writes_trigger)}});
    }
    return Status::OK();
#else
    (void)thread;
    (void)compaction_style;
    (void)write_mode;
    fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
    return Status::NotSupported(
        "Rocksdb Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
  }

  void ReadSequential(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadSequential(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }

  void ReadSequential(ThreadState* thread, DB* db) {
    ReadOptions options(FLAGS_verify_checksum, true);
    options.tailing = FLAGS_use_tailing_iterator;

    Iterator* iter = db->NewIterator(options);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }

    delete iter;
    thread->stats.AddBytes(bytes);
    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  void ReadReverse(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadReverse(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }

  void ReadReverse(ThreadState* thread, DB* db) {
    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
  }

  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    DB* db = SelectDBWithCfh(thread)->db;

    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }

    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        auto status = db->Get(options, key, &value);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

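  // Draws a key index in [0, FLAGS_num). With --read_random_exp_range the
  // draw is exponentially skewed and then scrambled with a multiplicative
  // constant so that hot keys are not clustered in key space.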
  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow is like %(2^64). Will have little impact of results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }

  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int num_keys = 0;
    int64_t key_rand = GetRandomKey(&thread->rand);
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use the same key_rand as the seed for the key and the column
      // family so that we can deterministically find the cfh corresponding
      // to a particular key, as is done in the DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                 &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(options,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }
      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
             found, read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  // Calls MultiGet over a list of keys from a random distribution.
  // Reports the total number of keys found.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t num_multireads = 0;
    int64_t found = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]> > key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      if (!FLAGS_multiread_batched) {
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
             found, read);
    thread->stats.AddMessage(msg);
  }

  // The inverse CDF of the Pareto distribution.
  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
    double ret;
    if (k == 0.0) {
      ret = theta - sigma * std::log(u);
    } else {
      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
    }
    return static_cast<int64_t>(ceil(ret));
  }
  // Inversion of the power function y = a * x^b.
  int64_t PowerCdfInversion(double u, double a, double b) {
    double ret;
    ret = std::pow((u / a), (1 / b));
    return static_cast<int64_t>(ceil(ret));
  }

  // Add noise to the QPS.
  double AddNoise(double origin, double noise_ratio) {
    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
      return origin;
    }
    int band_int = static_cast<int>(FLAGS_sine_a);
    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
    if (origin + delta < 0) {
      return origin;
    } else {
      return (origin + delta);
    }
  }

  // decide the query type
  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
  class QueryDecider {
   public:
    std::vector<int> type_;
    std::vector<double> ratio_;
    int range_;

    QueryDecider() {}
    ~QueryDecider() {}

    Status Initiate(std::vector<double> ratio_input) {
      int range_max = 1000;
      double sum = 0.0;
      for (auto& ratio : ratio_input) {
        sum += ratio;
      }
      range_ = 0;
      for (auto& ratio : ratio_input) {
        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
        type_.push_back(range_);
        ratio_.push_back(ratio / sum);
      }
      return Status::OK();
    }

    int GetType(int64_t rand_num) {
      if (rand_num < 0) {
        rand_num = rand_num * (-1);
      }
      assert(range_ != 0);
      int pos = static_cast<int>(rand_num % range_);
      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
        if (pos < type_[i]) {
          return i;
        }
      }
      return 0;
    }
  };

  // The graph workload mixed with Get, Put, and Iterator (seek) queries.
  void MixGraph(ThreadState* thread) {
    int64_t read = 0;  // including single gets and Next of iterators
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }

    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // the limit of qps initiation
    if (FLAGS_sine_a != 0 || FLAGS_sine_d != 0) {
      thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
          static_cast<int64_t>(read_rate), 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t rand_v, key_rand, key_seed;
      rand_v = GetRandomKey(&thread->rand) % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;
      key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
      Random64 rand(key_seed);
      key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      int query_type = query.GetType(rand_v);

      // change the qps
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      if (usecs_since_last >
          (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate =
            mix_rate_with_noise * query.ratio_[1] * FLAGS_mix_ave_kv_size;

        thread->shared->write_rate_limiter.reset(
            NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
        thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
            static_cast<int64_t>(read_rate),
            FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000}, 10,
            RateLimiter::Mode::kReadsOnly));
      }
      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        read++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                   &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(options,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }

        if (s.ok()) {
          found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        if (thread->shared->read_rate_limiter.get() != nullptr &&
            read % 256 == 255) {
          thread->shared->read_rate_limiter->Request(
              256, Env::IO_HIGH, nullptr /* stats */,
              RateLimiter::OpType::kRead);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        int64_t value_size = ParetoCdfInversion(
            u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
        if (value_size < 0) {
          value_size = 10;
        } else if (value_size > value_max) {
          value_size = value_size % value_max;
        }
        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(value_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }

        if (thread->shared->write_rate_limiter) {
          thread->shared->write_rate_limiter->Request(
              key.size() + value_size, Env::IO_HIGH, nullptr /*stats*/,
              RateLimiter::OpType::kWrite);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(options);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            read++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64 " of %" PRIu64
             " in %" PRIu64 " found)\n",
             gets, puts, seek, found, read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  void IteratorCreation(ThreadState* thread) {
    Duration duration(FLAGS_duration, reads_);
    ReadOptions options(FLAGS_verify_checksum, true);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      Iterator* iter = db->NewIterator(options);
      delete iter;
      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }
  }

  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      IteratorCreation(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    options.total_order_seek = FLAGS_total_order_seek;
    options.prefix_same_as_start = FLAGS_prefix_same_as_start;
    options.tailing = FLAGS_use_tailing_iterator;
    options.readahead_size = FLAGS_readahead_size;

    Iterator* single_iter = nullptr;
    std::vector<Iterator*> multi_iters;
    if (db_.db != nullptr) {
      single_iter = db_.db->NewIterator(options);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        multi_iters.push_back(db_with_cfh.db->NewIterator(options));
      }
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      }

      if (!FLAGS_use_tailing_iterator) {
        if (db_.db != nullptr) {
          delete single_iter;
          single_iter = db_.db->NewIterator(options);
        } else {
          for (auto iter : multi_iters) {
            delete iter;
          }
          multi_iters.clear();
          for (const auto& db_with_cfh : multi_dbs_) {
            multi_iters.push_back(db_with_cfh.db->NewIterator(options));
          }
        }
      }
      // Pick an Iterator to use
      Iterator* iter_to_use = single_iter;
      if (single_iter == nullptr) {
        iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read them.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
5241 5242 5243
        assert(iter_to_use->status().ok());
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete single_iter;
    for (auto iter : multi_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
             found, read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

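  // Reader/writer mix: every thread except thread 0 runs SeekRandom, while
  // thread 0 acts as a dedicated background writer (see BGWriter).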
  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void SeekRandomWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

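  // Deletes keys in batches of entries_per_batch_, either sequentially or at
  // random positions in the key space, until the duration (or deletes_) runs
  // out.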
  void DoDelete(ThreadState* thread, bool seq) {
    WriteBatch batch;
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
    int64_t i = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
        batch.Delete(key);
      }
      auto s = db->Write(write_options_, &batch);
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
      i += entries_per_batch_;
    }
  }

  void DeleteSeq(ThreadState* thread) {
    DoDelete(thread, true);
  }

  void DeleteRandom(ThreadState* thread) {
    DoDelete(thread, false);
  }

  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void ReadWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

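  // Dedicated writer thread used by the *WhileWriting / *WhileMerging
  // benchmarks: keeps issuing Put or Merge operations until the reader
  // threads finish, or until writes_ operations when --finish_after_writes
  // is set.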
  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    uint32_t written = 0;
    bool hint_printed = false;

    while (true) {
      DB* db = SelectDB(thread);
      {
        MutexLock l(&thread->shared->mu);
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
                      static_cast<int>(writes_) - written);
              hint_printed = true;
            }
          } else {
            // Finish the write immediately
            break;
          }
        }
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Status s;

      if (write_merge == kWrite) {
        s = db->Put(write_options_, key, gen.Generate(value_size_));
      } else {
        s = db->Merge(write_options_, key, gen.Generate(value_size_));
      }
      written++;

      if (!s.ok()) {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
            entries_per_batch_ * (value_size_ + key_size_), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
      }
    }
    thread->stats.AddBytes(bytes);
  }

  void ReadWhileScanning(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGScan(thread);
    }
  }

  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options;
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }

  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
  // in the DB atomically, i.e., in a single batch. Also see GetMany.
  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
                 const Slice& value) {
    std::string suffixes[3] = {"2", "1", "0"};
    std::string keys[3];

    WriteBatch batch;
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Put(keys[i], value);
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }


  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
  // in the DB atomically, i.e., in a single batch. Also see GetMany.
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

    WriteBatch batch;
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }

  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
                 std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = readoptions;
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }

  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  //     multiple writes (including puts and deletes) it uses up to
  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate(value_size_));
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \
             PRIu64 " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = db->Put(write_options_, key, gen.Generate(value_size_));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
             " total:%" PRIu64 " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  //
  // Read-modify-write for random keys
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size();
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value_size_, Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Status s = db->Put(write_options_, key, gen.Generate(value_size_));
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-XOR-write for random keys. Xors the existing value with a randomly
  // generated value, and stores the result. Assuming A in the array of bytes
  // representing the existing value, we generate an array B of the same size,
  // then compute C = A^B as C[i]=A[i]^B[i], and store C
  void XORUpdateRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      Slice value = gen.Generate(value_size_);
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s = db->Put(write_options_, key, Slice(new_value));
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys.
  // Each operation grows the value by value_size (simulating an append).
  // Generally used for benchmarking against merges of similar type
  void AppendRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size();
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate(value_size_);
      if (value.size() > 0) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1,',');
      }
      value.append(operand.data(), operand.size());

      // Write back to the database
      Status s = db->Put(write_options_, key, value);
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value.size();
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
            readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys (using MergeOperator)
  // The merge operator to use should be defined by FLAGS_merge_operator
  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
  // Assumes that the merge operator is non-null (i.e.: is well-defined)
  //
  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
  // to simulate random additions over 64-bit integers using merge.
  //
  // The number of merges on the same key can be controlled by adjusting
  // FLAGS_merge_keys.
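  //
  // A typical invocation might look like the following (benchmark name and
  // exact command-line flag spellings are assumed from the FLAGS_*
  // identifiers referenced above):
  //   ./db_bench --benchmarks=mergerandom --merge_operator=uint64add
  //              --value_size=8 --merge_keys=10000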
  void MergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    int64_t bytes = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t key_rand = thread->rand.Next() % merge_keys_;
      GenerateKeyFromInt(key_rand, merge_keys_, &key);

      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->GetCfh(key_rand), key,
                                   gen.Generate(value_size_));
      } else {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   gen.Generate(value_size_));
      }

      if (!s.ok()) {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value_size_;
      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
    }

    // Print some statistics
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read and merge random keys. The number of reads and merges is controlled
  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
  // keys (and thus also the number of reads and merges on the same key) can be
  // adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;
    int64_t num_gets = 0;
    int64_t num_merges = 0;
    size_t max_length = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate(value_size_));
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(options, key, &value);
        if (value.length() > max_length)
          max_length = value.length();

        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }

  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    std::unique_ptr<Iterator> iter(
      db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
        if (!FLAGS_reverse_iterator) {
          iter->Next();
        } else {
          iter->Prev();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }

  bool binary_search(std::vector<int>& data, int start, int end, int key) {
    if (data.empty()) return false;
    if (start > end) return false;
    int mid = start + (end - start) / 2;
    if (mid > static_cast<int>(data.size()) - 1) return false;
    if (data[mid] == key) {
      return true;
    } else if (data[mid] > key) {
      return binary_search(data, start, mid - 1, key);
    } else {
      return binary_search(data, mid + 1, end, key);
    }
  }

  // Does a bunch of merge operations for a key (key1) where the merge operand
  // is a sorted list. A performance comparison is then done between doing a
  // Get for key1 followed by searching for another key (key2) in the large
  // sorted list vs. calling GetMergeOperands for key1 and then searching for
  // key2 in each of the sorted sub-lists. The latter case is expected to be a
  // lot faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;
    std::string key = "my_key";
    std::string value;

    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with and it will demonstrate the
    // perf difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list vs doing GetMergeOperands and searching
    // for lookup_key within this resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) break;
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }

#ifndef ROCKSDB_LITE
  // This benchmark stress tests Transactions.  For a given --duration (or
  // total number of --writes), a Transaction will perform a read-modify-write
  // to increment the value of a key in each of N (--transaction_sets) sets of
  // keys (where each set has --num keys).  If --threads is set, this will be
  // done in parallel.
  //
  // To test transactions, use --transaction_db=true.  Not setting this
  // parameter will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the results
  // by checking if the sum of all keys in each set is the same.
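  //
  // A possible invocation (benchmark name and flag spellings assumed from the
  // FLAGS_* identifiers referenced here):
  //   ./db_bench --benchmarks=randomtransaction --transaction_db=true
  //              --transaction_sets=4 --threads=8 --duration=60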
  void RandomTransaction(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    Duration duration(FLAGS_duration, readwrites_);
    ReadOptions read_options(FLAGS_verify_checksum, true);
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
    uint64_t transactions_done = 0;

    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      abort();
    }

    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
                                       read_options, FLAGS_num,
                                       num_prefix_ranges);

A
agiardullo 已提交
6074 6075 6076 6077 6078 6079 6080 6081
    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      abort();
    }

    while (!duration.Done(1)) {
      bool success;

      // RandomTransactionInserter will attempt to insert a key for each
      // # of FLAGS_transaction_sets
      if (FLAGS_optimistic_transaction_db) {
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
        success = inserter.TransactionDBInsert(txn_db, txn_options);
      } else {
        success = inserter.DBInsert(db_.db);
      }

      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        abort();
      }

      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
      transactions_done++;
    }

    char msg[100];
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
               transactions_done, inserter.GetFailureCount());
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);

6115
    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
6116 6117
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
A
agiardullo 已提交
6118
    }
6119
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
A
agiardullo 已提交
6120 6121 6122 6123 6124 6125
  }

  // Verifies consistency of data after RandomTransaction() has been run.
  // Since each iteration of RandomTransaction() incremented a key in each set
  // by the same value, the sum of the keys in each set should be the same.
  void RandomTransactionVerify() {
A
agiardullo 已提交
6126
    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
A
agiardullo 已提交
6127 6128 6129 6130
      // transactions not used, nothing to verify.
      return;
    }

A
agiardullo 已提交
6131
    Status s =
S
SherlockNoMad 已提交
6132 6133
        RandomTransactionInserter::Verify(db_.db,
                            static_cast<uint16_t>(FLAGS_transaction_sets));
A
agiardullo 已提交
6134

A
agiardullo 已提交
6135 6136 6137 6138
    if (s.ok()) {
      fprintf(stdout, "RandomTransactionVerify Success.\n");
    } else {
      fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
A
agiardullo 已提交
6139 6140
    }
  }
6141
#endif  // ROCKSDB_LITE
A
agiardullo 已提交
6142

A
  //
  // This benchmark is intended to partially replicate the behavior of MyRocks
  // secondary indices: All data is stored in keys and updates happen by
  // deleting the old version of the key and inserting the new version.
  void RandomReplaceKeys(ThreadState* thread) {
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
    size_t max_counter = 50;
    RandomGenerator gen;

    Status s;
    DB* db = SelectDB(thread);
    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
      s = db->Put(write_options_, key, gen.Generate(value_size_));
      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    }

    db->GetSnapshot();

    std::default_random_engine generator;
    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
                                                  FLAGS_stddev);
    Duration duration(FLAGS_duration, FLAGS_num);
    while (!duration.Done(1)) {
      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
                                static_cast<int64_t>(0));
      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                         &key);
      s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
                                   : db->Delete(write_options_, key);
      if (s.ok()) {
        counters[key_id] = (counters[key_id] + 1) % max_counter;
        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                           &key);
        s = db->Put(write_options_, key, Slice());
      }

      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }

    char msg[200];
    snprintf(msg, sizeof(msg),
             "use single deletes: %d, "
             "standard deviation: %lf\n",
             FLAGS_use_single_deletes, FLAGS_stddev);
    thread->stats.AddMessage(msg);
  }

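  // Reader/deleter side of the TimeSeries benchmark: scans the prefix of a
  // randomly chosen key id and either reads the entries or deletes the ones
  // whose emulated timestamp has expired, until the writer thread is done.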
  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
    ReadOptions options(FLAGS_verify_checksum, true);
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;

    Iterator* iter = nullptr;
    // Only work on single database
    assert(db_.db != nullptr);
    iter = db_.db->NewIterator(options);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    char value_buffer[256];
    while (true) {
      {
        MutexLock l(&thread->shared->mu);
        if (thread->shared->num_done >= 1) {
          // Write thread have finished
          break;
        }
      }
      if (!FLAGS_use_tailing_iterator) {
        delete iter;
        iter = db_.db->NewIterator(options);
      }
      // Pick an Iterator to use

      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Reset last 8 bytes to 0
      char* start = const_cast<char*>(key.data());
      start += key.size() - 8;
      memset(start, 0, 8);
      ++read;

      bool key_found = false;
      // Seek the prefix
      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
           iter->Next()) {
        key_found = true;
        // Copy out iterator's value to make sure we read them.
        if (do_deletion) {
          bytes += iter->key().size();
          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
            db_.db->Delete(write_options_, iter->key());
          } else {
            break;
          }
        } else {
          bytes += iter->key().size() + iter->value().size();
          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
          Slice value = iter->value();
          memcpy(value_buffer, value.data(),
                 std::min(value.size(), sizeof(value_buffer)));

          assert(iter->status().ok());
        }
      }
      found += key_found;

      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
    }
    delete iter;

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > rocksdb::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  void TimeSeriesWrite(ThreadState* thread) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    Duration duration(FLAGS_duration, writes_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);

      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      // Write key id
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Write timestamp

      char* start = const_cast<char*>(key.data());
      char* pos = start + 8;
      int bytes_to_fill =
          std::min(key_size_ - static_cast<int>(pos - start), 8);
      uint64_t timestamp_value = timestamp_emulator_->Get();
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
      }

      timestamp_emulator_->Inc();

      Status s;

      s = db->Put(write_options_, key, gen.Generate(value_size_));

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes = key.size() + value_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      thread->stats.AddBytes(bytes);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
            entries_per_batch_ * (value_size_ + key_size_), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
      }
    }
  }

  void TimeSeries(ThreadState* thread) {
    if (thread->tid > 0) {
      bool do_deletion = FLAGS_expire_style == "delete" &&
                         thread->tid <= FLAGS_num_deletion_threads;
      TimeSeriesReadOrDelete(thread, do_deletion);
    } else {
      TimeSeriesWrite(thread);
      thread->stats.Stop();
      thread->stats.Report("timeseries write");
    }
  }

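  // Manually compacts the whole key range of the selected DB, forcing the
  // bottommost level to be recompacted as well.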
  void Compact(ThreadState* thread) {
    DB* db = SelectDB(thread);
    CompactRangeOptions cro;
    cro.bottommost_level_compaction =
        BottommostLevelCompaction::kForceOptimized;
    db->CompactRange(cro, nullptr, nullptr);
  }

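  // Runs a full manual compaction on every open database.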
  void CompactAll() {
    if (db_.db != nullptr) {
      db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
  }

  void ResetStats() {
    if (db_.db != nullptr) {
      db_.db->ResetStats();
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->ResetStats();
    }
  }

  void PrintStatsHistory() {
    if (db_.db != nullptr) {
      PrintStatsHistoryImpl(db_.db, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStatsHistoryImpl(db_with_cfh.db, true);
    }
  }

  void PrintStatsHistoryImpl(DB* db, bool print_header) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    std::unique_ptr<StatsHistoryIterator> shi;
    Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi);
    if (!s.ok()) {
      fprintf(stdout, "%s\n", s.ToString().c_str());
      return;
    }
    assert(shi);
    while (shi->Valid()) {
      uint64_t stats_time = shi->GetStatsTime();
      fprintf(stdout, "------ %s ------\n",
              TimeToHumanString(static_cast<int>(stats_time)).c_str());
      for (auto& entry : shi->GetStatsMap()) {
        fprintf(stdout, " %" PRIu64 "   %s  %" PRIu64 "\n", stats_time,
                entry.first.c_str(), entry.second);
      }
      shi->Next();
    }
  }

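  // Prints the value of the given DB property for every open database.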
  void PrintStats(const char* key) {
    if (db_.db != nullptr) {
      PrintStats(db_.db, key, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, key, true);
    }
  }

  void PrintStats(DB* db, const char* key, bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }
    std::string stats;
    if (!db->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
  }

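  // Replays a workload captured with the trace API (--trace_file) against the
  // single open database.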
  void Replay(ThreadState* thread) {
    if (db_.db != nullptr) {
      Replay(thread, &db_);
    }
  }

  void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
    Status s;
    std::unique_ptr<TraceReader> trace_reader;
    s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
                           &trace_reader);
    if (!s.ok()) {
      fprintf(
          stderr,
          "Encountered an error creating a TraceReader from the trace file. "
          "Error: %s\n",
          s.ToString().c_str());
      exit(1);
    }
    Replayer replayer(db_with_cfh->db, db_with_cfh->cfh,
                      std::move(trace_reader));
    replayer.SetFastForward(
        static_cast<uint32_t>(FLAGS_trace_replay_fast_forward));
    s = replayer.Replay();
    if (s.ok()) {
      fprintf(stdout, "Replay started from trace_file: %s\n",
              FLAGS_trace_file.c_str());
    } else {
      fprintf(stderr, "Starting replay failed. Error: %s\n",
              s.ToString().c_str());
    }
  }
};

int db_bench_tool(int argc, char** argv) {
  rocksdb::port::InstallStackTraceHandler();
  static bool initialized = false;
  if (!initialized) {
    SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                    " [OPTIONS]...");
    initialized = true;
  }
  ParseCommandLineFlags(&argc, &argv, true);
  FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
#ifndef ROCKSDB_LITE
  if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
    fprintf(stderr,
            "Cannot provide both --statistics and --statistics_string.\n");
    exit(1);
  }
  if (!FLAGS_statistics_string.empty()) {
6487 6488
    Status s = ObjectRegistry::NewInstance()->NewSharedObject<Statistics>(
        FLAGS_statistics_string, &dbstats);
6489
    if (dbstats == nullptr) {
6490 6491 6492
      fprintf(stderr,
              "No Statistics registered matching string: %s status=%s\n",
              FLAGS_statistics_string.c_str(), s.ToString().c_str());
6493 6494 6495 6496
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE
6497 6498
  if (FLAGS_statistics) {
    dbstats = rocksdb::CreateDBStatistics();
J
jorlow@chromium.org 已提交
6499
  }
S
6501
    dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
S
Siying Dong 已提交
6502
  }
6503
  FLAGS_compaction_pri_e = (rocksdb::CompactionPri)FLAGS_compaction_pri;
J
jorlow@chromium.org 已提交
6504

I
  std::vector<std::string> fanout = rocksdb::StringSplit(
      FLAGS_max_bytes_for_level_multiplier_additional, ',');
  for (size_t j = 0; j < fanout.size(); j++) {
S
sdong 已提交
6509 6510 6511 6512 6513
#ifndef CYGWIN
        std::stoi(fanout[j]));
#else
        stoi(fanout[j]));
#endif
6514 6515 6516 6517 6518
  }

  FLAGS_compression_type_e =
    StringToCompressionType(FLAGS_compression_type.c_str());

6519 6520 6521 6522 6523
#ifndef ROCKSDB_LITE
  if (!FLAGS_hdfs.empty() && !FLAGS_env_uri.empty()) {
    fprintf(stderr, "Cannot provide both --hdfs and --env_uri.\n");
    exit(1);
  } else if (!FLAGS_env_uri.empty()) {
6524
    Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env);
6525 6526 6527 6528 6529 6530
    if (FLAGS_env == nullptr) {
      fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str());
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE
6531 6532 6533 6534 6535 6536 6537
  if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
    fprintf(stderr,
            "`-use_existing_db` must be true for `-use_existing_keys` to be "
            "settable\n");
    exit(1);
  }

6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556
  if (!FLAGS_hdfs.empty()) {
    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
  }

  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::NONE;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
    FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED;
  else {
    fprintf(stdout, "Unknown compaction fadvice:%s\n",
            FLAGS_compaction_fadvice.c_str());
  }

  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());

6557 6558 6559
  // Note options sanitization may increase thread pool sizes according to
  // max_background_flushes/max_background_compactions/max_background_jobs
  FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
6560
                                  rocksdb::Env::Priority::HIGH);
6561 6562
  FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
                                  rocksdb::Env::Priority::BOTTOM);
6563 6564
  FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
                                  rocksdb::Env::Priority::LOW);
6565

H
heyongqiang 已提交
6566
  // Choose a location for the test database if none given with --db=<path>
6567 6568 6569 6570 6571
  if (FLAGS_db.empty()) {
    std::string default_db_path;
    rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path;
H
heyongqiang 已提交
6572 6573
  }

6574 6575 6576 6577 6578 6579
  if (FLAGS_stats_interval_seconds > 0) {
    // When both are set then FLAGS_stats_interval determines the frequency
    // at which the timer is checked for FLAGS_stats_interval_seconds
    FLAGS_stats_interval = 1000;
  }

6580 6581 6582 6583 6584
  if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
    fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
    exit(1);
  }

  rocksdb::Benchmark benchmark;
  benchmark.Run();

#ifndef ROCKSDB_LITE
  if (FLAGS_print_malloc_stats) {
    std::string stats_string;
    rocksdb::DumpMallocStats(&stats_string);
    fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
  }
#endif  // ROCKSDB_LITE

  return 0;
}
}  // namespace rocksdb
#endif