#! /usr/bin/env python
import os
import re
import sys
import time
import random
import logging
import tempfile
import subprocess
import shutil
import argparse

# params overwrite priority:
#   for default:
#       default_params < blackbox|whitebox_default_params < args
#   for simple:
#       simple_default_params < blackbox|whitebox_simple_default_params < args
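# e.g. a plain whitebox run starts from default_params, overlays
# whitebox_default_params on top, and finally applies any flags passed
# explicitly on the command line (see gen_cmd_params below).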

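# Parameter values may be callables (e.g. lambda: random.randint(0, 1));
# finalize_and_sanitize() samples them once per generated db_stress command.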
default_params = {
    "block_size": 16384,
    "cache_size": 1048576,
    "delpercent": 5,
    "destroy_db_initially": 0,
    "disable_data_sync": 0,
    "disable_wal": 0,
    "filter_deletes": lambda: random.randint(0, 1),
    "allow_concurrent_memtable_write": 0,
    "iterpercent": 10,
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "memtablerep": "prefix_hash",
    "mmap_read": lambda: random.randint(0, 1),
    "open_files": 500000,
    "prefix_size": 7,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "reopen": 20,
    "sync": 0,
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    "threads": 32,
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
}


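# Pick the database directory for this test: a subdirectory of TEST_TMPDIR if
# that is set (wiping any leftovers from a previous run), otherwise a fresh
# directory from tempfile.mkdtemp().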
def get_dbname(test_name):
    test_tmpdir = os.environ.get("TEST_TMPDIR")
    if test_tmpdir is None or test_tmpdir == "":
        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
    else:
        dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name
        shutil.rmtree(dbname, True)
    return dbname

blackbox_default_params = {
    # total time for this script to test db_stress
    "duration": 6000,
    # time for one db_stress instance to run
    "interval": 120,
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "set_options_one_in": 10000,
    "test_batches_snapshots": 1,
}

whitebox_default_params = {
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "nooverwritepercent": 1,
    "ops_per_thread": 200000,
    "test_batches_snapshots": lambda: random.randint(0, 1),
    "write_buffer_size": 4 * 1024 * 1024,
}

simple_default_params = {
    "block_size": 16384,
    "cache_size": 1048576,
    "column_families": 1,
    "delpercent": 5,
    "destroy_db_initially": 0,
    "disable_data_sync": 0,
    "disable_wal": 0,
    "filter_deletes": lambda: random.randint(0, 1),
89
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
    "iterpercent": 10,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "memtablerep": "skip_list",
    "mmap_read": lambda: random.randint(0, 1),
    "prefix_size": 0,
    "prefixpercent": 0,
    "progress_reports": 0,
    "readpercent": 50,
    "reopen": 20,
    "sync": 0,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "threads": 32,
    "verify_checksum": 1,
    "write_buffer_size": 32 * 1024 * 1024,
    "writepercent": 35,
}

blackbox_simple_default_params = {
    "duration": 6000,
    "interval": 120,
    "open_files": -1,
    "ops_per_thread": 100000000,
    "set_options_one_in": 0,
    "test_batches_snapshots": 0,
}

whitebox_simple_default_params = {
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "nooverwritepercent": 1,
    "open_files": 500000,
    "ops_per_thread": 200000,
    "write_buffer_size": 32 * 1024 * 1024,
}


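# Resolve callable (randomized) parameter values and drop option combinations
# that db_stress does not support.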
def finalize_and_sanitize(src_params):
    dest_params = dict([(k,  v() if callable(v) else v)
                        for (k, v) in src_params.items()])
    # --allow_concurrent_memtable_write with --filter_deletes is not supported.
    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["filter_deletes"] = 0
    return dest_params


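# Merge the parameter dictionaries in the priority order documented at the top
# of this file, applying explicitly passed command-line arguments last.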
def gen_cmd_params(args):
    params = {}

    if args.simple:
        params.update(simple_default_params)
        if args.test_type == 'blackbox':
            params.update(blackbox_simple_default_params)
        if args.test_type == 'whitebox':
            params.update(whitebox_simple_default_params)

    if not args.simple:
        params.update(default_params)
        if args.test_type == 'blackbox':
            params.update(blackbox_default_params)
        if args.test_type == 'whitebox':
            params.update(whitebox_default_params)

    for k, v in vars(args).items():
        if v is not None:
            params[k] = v
    return params


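# Render the merged parameters as one ./db_stress shell command, skipping the
# parameters that only this script consumes (test_type, simple, duration,
# interval). Illustrative, abridged output (actual flags and values vary per
# run):
#   ./db_stress --block_size=16384 --cache_size=1048576 ... --db=<dbname>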
def gen_cmd(params):
    cmd = './db_stress ' + ' '.join(
        '--{0}={1}'.format(k, v)
        for k, v in finalize_and_sanitize(params).items()
        if k not in set(['test_type', 'simple', 'duration', 'interval'])
        and v is not None)
    return cmd


# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args):
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n"
          + "threads=" + str(cmd_params['threads']) + "\n"
          + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
          + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")

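    # Each iteration runs db_stress against the same database for 'interval'
    # seconds, kills it, and fails the test if anything was written to stderr.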
    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        cmd = gen_cmd(dict(cmd_params.items() + {'db': dbname}.items()))

        child = subprocess.Popen([cmd],
                                 stderr=subprocess.PIPE, shell=True)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, cmd))

        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        while True:
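            # Any stderr output from db_stress is treated as a failure of
            # this run.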
            line = child.stderr.readline().strip()
            if line != '':
                run_had_errors = True
                print('***' + line + '^')
            else:
                break

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)


# This python script runs db_stress multiple times. Some runs use
# kill_random_test, which causes rocksdb to crash at various points in code.
def whitebox_crash_main(args):
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] / 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n"
          + "threads=" + str(cmd_params['threads']) + "\n"
          + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
          + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")

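    # Cycle through four check modes: 0 runs the crash test with
    # kill_random_test; 1 and 2 are normal runs with universal and FIFO
    # compaction respectively; 3 is a plain normal run.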
    total_check_mode = 4
    check_mode = 0
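    # Base value for db_stress's --kill_random_test option; kill modes 1 and 2
    # below divide it to make a crash more likely when fewer kill points are
    # covered.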
    kill_random_test = 888887
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers fewer kill points
            # but increases the chance of triggering them. Mode 2 covers even
            # fewer kill points and further increases the triggering chance.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    "kill_random_test": (kill_random_test / 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                additional_opts.update({
                    "kill_random_test": (kill_random_test / 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill modes 0, 1 and 2 in turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with a lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] / 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()
                           + {'db': dbname}.items()))

        print "Running:" + cmd + "\n"

        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
        stdoutdata, stderrdata = popen.communicate()
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
               check_mode, additional_opts['kill_random_test'], retncode))
        print msg
        print stdoutdata

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode < 0:
            # we expect negative retncode if kill option was given
            expected = True

        if not expected:
            print "TEST FAILED. See kill option and exit code above!!!\n"
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
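        # Scan the (lower-cased) output for 'error', not counting the benign
        # 'got errors 0 times' message.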
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print "#times error occurred in output is " + str(errorcount) + "\n"

        if (errorcount > 0):
            print "TEST FAILED. Output has 'error'!!!\n"
            sys.exit(2)
        if (stdoutdata.find('fail') >= 0):
            print "TEST FAILED. Output has 'fail'!!!\n"
            sys.exit(2)

        # For the first half of the duration, keep doing the kill test. For
        # the second half, rotate through the other check modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill


def main():
    parser = argparse.ArgumentParser(description="This script runs and kills \
        db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")

    all_params = dict(default_params.items()
                      + blackbox_default_params.items()
                      + whitebox_default_params.items()
                      + simple_default_params.items()
                      + blackbox_simple_default_params.items()
                      + whitebox_simple_default_params.items())

    for k, v in all_params.items():
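        # Expose every known parameter as an optional flag, typed after its
        # default value (callables are invoked once here just to discover the
        # type).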
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    args = parser.parse_args()

    if args.test_type == 'blackbox':
        blackbox_crash_main(args)
    if args.test_type == 'whitebox':
        whitebox_crash_main(args)

if __name__ == '__main__':
    main()