#!/usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import os
import sys
import time
6
import random
7
import tempfile
8
import subprocess
I
Igor Canadi 已提交
9
import shutil
10
import argparse
11

# params overwrite priority:
#   for default:
#       default_params < {blackbox,whitebox}_default_params < args
#   for simple:
#       default_params < {blackbox,whitebox}_default_params <
#       simple_default_params <
#       {blackbox,whitebox}_simple_default_params < args
#   for enable_atomic_flush:
#       default_params < {blackbox,whitebox}_default_params <
#       atomic_flush_params < args

# Temporary file whose path is handed to db_stress through the
# "expected_values_path" parameter below. It is created at import time and
# removed when the file object is closed or garbage collected.
expected_values_file = tempfile.NamedTemporaryFile()

# Baseline db_stress flags shared by every test type. A value is either a
# constant or a zero-argument callable; callables are invoked by
# finalize_and_sanitize() each time a command line is generated, so those
# flags are re-randomized for every db_stress run.
default_params = {
    "acquire_snapshot_one_in": 10000,
    "block_size": 16384,
    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
    "cache_size": 1048576,
    "checkpoint_one_in": 1000000,
    "compression_type": "snappy",
    "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
    "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    # Temporarily disabled until its concurrency issues are fixed
    "enable_pipelined_write": 0,
    "expected_values_path": expected_values_file.name,
    "flush_one_in": 1000000,
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    "nooverwritepercent": 1,
    "open_files": 500000,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": lambda: random.randint(0, 1),
    "reopen": 20,
    "snapshot_hold_ops": 100000,
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    "format_version": lambda: random.randint(2, 4),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
    "use_multiget": lambda: random.randint(0, 1),
}
71

72 73
# Name of the environment variable that selects the parent directory for
# the test databases.
_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'


def get_dbname(test_name):
    """Create and return a fresh, empty database directory for a test.

    When the TEST_TMPDIR environment variable is unset or empty, a unique
    directory is created with tempfile.mkdtemp(). Otherwise the directory
    "<TEST_TMPDIR>/rocksdb_crashtest_<test_name>" is used; anything already
    there is removed first so the run starts from an empty directory.

    Args:
        test_name: suffix identifying the test flavor (e.g. 'blackbox').

    Returns:
        Path of the newly created directory.

    Raises:
        OSError: if the directory cannot be created.
    """
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if not test_tmpdir:
        # Covers both "unset" (None) and "set but empty".
        return tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)

    dbname = os.path.join(test_tmpdir, "rocksdb_crashtest_" + test_name)
    # ignore_errors=True: the directory may simply not exist yet.
    shutil.rmtree(dbname, True)
    os.mkdir(dbname)
    return dbname

85 86 87 88 89 90 91 92 93 94

def is_direct_io_supported(dbname):
    """Report whether a file inside `dbname` can be opened with O_DIRECT.

    A temporary file is created in `dbname` and re-opened with the
    os.O_DIRECT flag; the probe descriptor is closed again right away.

    Args:
        dbname: an existing directory in which to create the probe file.

    Returns:
        True if the O_DIRECT open succeeded, False if it failed or the
        platform's os module does not define O_DIRECT.
    """
    with tempfile.NamedTemporaryFile(dir=dbname) as f:
        try:
            fd = os.open(f.name, os.O_DIRECT)
        except (OSError, AttributeError):
            # OSError: the open was rejected.
            # AttributeError: os.O_DIRECT is not defined on this platform.
            return False
        # Close the probe descriptor so repeated calls do not leak fds.
        os.close(fd)
        return True


95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
# Overrides layered on top of default_params for the blackbox test type.
blackbox_default_params = dict(
    # overall time budget for this script's blackbox loop
    duration=6000,
    # how long a single db_stress instance runs before it is killed
    interval=120,
    # db_stress gets killed anyway, so make ops_per_thread effectively
    # unbounded
    ops_per_thread=100000000,
    set_options_one_in=10000,
    test_batches_snapshots=1,
)

# Overrides layered on top of default_params for the whitebox test type.
whitebox_default_params = {
    # overall time budget for the whitebox loop
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    # Base value for db_stress's --kill_random_test flag; consumed by
    # whitebox_crash_main() and never passed to db_stress under this name.
    "random_kill_odd": 888887,
    # Re-randomized each time a command line is generated.
    "test_batches_snapshots": lambda: random.randint(0, 1),
}

# Overrides applied (after the per-test-type defaults) when --simple is
# given on the command line.
simple_default_params = {
    # Re-randomized each time a command line is generated.
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "prefixpercent": 25,
    "readpercent": 25,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
}

# Extra overrides used only when both --simple and the blackbox test type
# are selected; applied after simple_default_params.
blackbox_simple_default_params = dict(
    open_files=-1,
    set_options_one_in=0,
)

133
# Extra overrides used only when both --simple and the whitebox test type
# are selected; currently there are none.
whitebox_simple_default_params = {}
134

135 136 137
# Overrides applied last (before explicit command-line arguments) when
# --enable_atomic_flush is given.
atomic_flush_params = {
    "disable_wal": 1,
    "reopen": 0,
    "test_atomic_flush": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
    # disable pipelined write when test_atomic_flush is true
    "enable_pipelined_write": 0,
}

146

147 148 149
def finalize_and_sanitize(src_params):
    """Resolve randomized parameters and drop combinations this script avoids.

    Every callable value in `src_params` is invoked once to obtain its
    concrete value; the adjustments below are then applied to the copy.

    Args:
        src_params: dict mapping parameter name to a constant or a
            zero-argument callable. Must contain "mmap_read"; must contain
            "db" unless "mmap_read" resolves to 1; and must contain
            "delpercent" and "delrangepercent" when "test_batches_snapshots"
            resolves to 1.

    Returns:
        A new dict of concrete values; `src_params` is not modified.

    Raises:
        KeyError: if one of the required keys listed above is missing.
    """
    dest_params = {k: (v() if callable(v) else v)
                   for k, v in src_params.items()}

    # The zstd training-bytes setting is only kept when the compression
    # type is zstd and a non-zero dictionary size was chosen.
    if dest_params.get("compression_type") != "zstd" or \
            dest_params.get("compression_max_dict_bytes") == 0:
        dest_params["compression_zstd_max_train_bytes"] = 0

    # Concurrent memtable writes (treated as enabled when the key is
    # absent) are always paired with the skip_list memtable here.
    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["memtablerep"] = "skip_list"

    # Direct I/O is switched off when mmap reads are selected or when the
    # db directory cannot be opened with O_DIRECT. The probe is skipped
    # (short-circuit) when mmap_read is 1.
    if dest_params["mmap_read"] == 1 or not is_direct_io_supported(
            dest_params["db"]):
        dest_params["use_direct_io_for_flush_and_compaction"] = 0
        dest_params["use_direct_reads"] = 0

    # With test_batches_snapshots on, the range-delete share is folded
    # into the plain delete share.
    if dest_params.get("test_batches_snapshots") == 1:
        dest_params["delpercent"] += dest_params["delrangepercent"]
        dest_params["delrangepercent"] = 0

    return dest_params


165 166 167
def gen_cmd_params(args):
    """Build the layered parameter dict for the selected test configuration.

    Layers are applied in increasing priority: default_params, the
    per-test-type defaults, the --simple overrides (generic, then
    per-test-type), the atomic-flush overrides, and finally every
    command-line argument that was given a value.

    Args:
        args: parsed argparse namespace; reads test_type, simple and
            enable_atomic_flush, and copies every non-None attribute.

    Returns:
        A new dict; values may still be callables, which are resolved
        later by finalize_and_sanitize().
    """
    params = {}

    params.update(default_params)
    if args.test_type == 'blackbox':
        params.update(blackbox_default_params)
    if args.test_type == 'whitebox':
        params.update(whitebox_default_params)
    if args.simple:
        params.update(simple_default_params)
        if args.test_type == 'blackbox':
            params.update(blackbox_simple_default_params)
        if args.test_type == 'whitebox':
            params.update(whitebox_simple_default_params)
    if args.enable_atomic_flush:
        params.update(atomic_flush_params)

    # Explicit command-line values win over every default layer. Options
    # that were not given are None and are skipped.
    for k, v in vars(args).items():
        if v is not None:
            params[k] = v
    return params


188
def gen_cmd(params, unknown_params):
    """Build the db_stress argv list from a parameter dict.

    Args:
        params: parameter dict; passed through finalize_and_sanitize()
            before being rendered as "--name=value" flags.
        unknown_params: list of extra argument strings appended verbatim.

    Returns:
        List of strings starting with './db_stress'.
    """
    # Keys that steer this script only and are never rendered as flags.
    # Built once instead of once per key inside the comprehension.
    script_only_keys = frozenset(['test_type', 'simple', 'duration',
                                  'interval', 'random_kill_odd',
                                  'enable_atomic_flush'])
    finalized = finalize_and_sanitize(params)
    cmd = ['./db_stress'] + [
        '--{0}={1}'.format(k, v)
        for k, v in finalized.items()
        # A None value means "omit this flag".
        if k not in script_only_keys and v is not None] + unknown_params
    return cmd


# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
200
def blackbox_crash_main(args, unknown_args):
    """Run db_stress repeatedly, killing each instance after an interval.

    Until cmd_params['duration'] seconds have elapsed, a db_stress process
    is started against the same database directory and allowed to run for
    cmd_params['interval'] seconds, then killed unless it already exited.
    Its stderr is scanned afterwards; any non-empty line that does not
    start with 'WARNING' is reported and the script exits with status 2.
    The database directory is removed only when the loop finishes without
    such an error.

    Args:
        args: parsed argparse namespace, forwarded to gen_cmd_params().
        unknown_args: list of extra argument strings passed to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        # A fresh command line per run: callable parameters are
        # re-randomized by gen_cmd() each time.
        cmd = gen_cmd(dict(cmd_params, db=dbname), unknown_args)

        child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        # Poll once per second until the kill deadline, stopping early if
        # db_stress exits by itself.
        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        # Scan stderr up to EOF. Only an empty read means EOF; a blank
        # line is skipped rather than ending the scan, so errors printed
        # after a blank line are still detected.
        while True:
            raw_line = child.stderr.readline()
            if not raw_line:
                break
            line = raw_line.strip()
            if not line:
                continue
            if not isinstance(line, str):
                # Pipes yield bytes on Python 3; decode for comparison.
                line = line.decode('utf-8', 'replace')
            if not line.startswith('WARNING'):
                run_had_errors = True
                print('stderr has error message:')
                print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
I
Igor Canadi 已提交
255

256 257 258

# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
259
def whitebox_crash_main(args, unknown_args):
    """Run db_stress repeatedly, letting it kill itself at random points.

    Each iteration runs db_stress to completion in one of four check
    modes. Mode 0 passes --kill_random_test (cycling through three kill
    modes); modes 1-3 are normal runs with universal compaction, FIFO
    compaction and the default settings. check_mode stays at 0 until half
    of cmd_params['duration'] has elapsed; after that, every iteration
    wipes the database directory and advances to the next check mode.

    Exits with status 1 when the return code does not match what the mode
    expects, and with status 2 when the output mentions 'error' (other
    than 'got errors 0 times') or 'fail'.

    Args:
        args: parsed argparse namespace, forwarded to gen_cmd_params().
        unknown_args: list of extra argument strings passed to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] // 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers fewer kill points
            # but increases the chance of triggering them. Mode 2 covers
            # even fewer kill points and further increases that chance.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    # Floor division keeps the flag value an integer.
                    "kill_random_test": (kill_random_test // 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test // 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # Later layers win: per-mode options override cmd_params, and the
        # db path is always set last.
        run_params = dict(cmd_params)
        run_params.update(additional_opts)
        run_params['db'] = dbname
        cmd = gen_cmd(run_params, unknown_args)

        print("Running:" + ' '.join(cmd) + "\n")

        # stderr is merged into stdout so both streams are scanned below.
        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        stdoutdata, _ = popen.communicate()
        if not isinstance(stdoutdata, str):
            # Pipes yield bytes on Python 3; decode before text matching.
            stdoutdata = stdoutdata.decode('utf-8', 'replace')
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
               check_mode, additional_opts['kill_random_test'], retncode))
        print(msg)
        print(stdoutdata)

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode <= 0:
            # When kill option is given, the test MIGHT kill itself.
            # If it does, negative retncode is expected. Otherwise 0.
            expected = True

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
        # 'got errors 0 times' contains the word 'error' but reports
        # success, so those occurrences are subtracted.
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if (errorcount > 0):
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if (stdoutdata.find('fail') >= 0):
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            os.mkdir(dbname)
            # The database was wiped, so later runs are no longer pointed
            # at the expected-values file.
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill


def main():
    """Parse the command line and run the selected crash test.

    Every key of the default parameter dicts becomes an optional
    "--<key>" argument whose type is taken from the default value.
    Arguments the parser does not recognize are forwarded to db_stress
    unchanged. Exits with status 1 if TEST_TMPDIR is set to a non-empty
    value that is not an existing directory.
    """
    parser = argparse.ArgumentParser(description="This script runs and kills \
        db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--enable_atomic_flush", action='store_true')

    # Union of all known parameter names; for duplicate keys the later
    # dict's value wins, which only affects the inferred argument type.
    all_params = {}
    for param_set in (default_params,
                      blackbox_default_params,
                      whitebox_default_params,
                      simple_default_params,
                      blackbox_simple_default_params,
                      whitebox_simple_default_params):
        all_params.update(param_set)

    for k, v in all_params.items():
        # Callables are invoked once just to learn the value's type.
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    # An empty value counts as "unset", matching get_dbname(), which falls
    # back to a fresh temporary directory in that case.
    if test_tmpdir and not os.path.isdir(test_tmpdir):
        print('%s env var is set to a non-existent directory: %s' %
              (_TEST_DIR_ENV_VAR, test_tmpdir))
        sys.exit(1)

    if args.test_type == 'blackbox':
        blackbox_crash_main(args, unknown_args)
    if args.test_type == 'whitebox':
        whitebox_crash_main(args, unknown_args)
410 411 412

# Run the crash test only when executed as a script, not when imported.
if __name__ == '__main__':
    main()